From 3c60209d88a74614e4ce294e07ca3483a92cc3da Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 28 Oct 2021 22:06:04 -0700 Subject: [PATCH 01/16] LUCENE-10061: Implements basic dynamic pruning support for CombinedFieldsQuery // HighMed: from office # freq=3224339 freq=225338 // top scores time usage 425 milliseconds // complete time usage 401 milliseconds // HighHigh: but publisher # freq=1456553 freq=1289029 // top scores time usage 469 milliseconds // complete time usage 322 milliseconds // HighLow: with fung # freq=3709421 freq=1344 // top scores time usage 241 milliseconds // complete time usage 428 milliseconds // HighLow: date insult # freq=2020626 freq=4424 // top scores time usage 171 milliseconds // complete time usage 225 milliseconds --- .../randomization/policies/tests.policy | 5 + .../sandbox/search/CombinedFieldQuery.java | 407 +++++++++++++++++- .../search/TestCombinedFieldQuery.java | 81 ++++ 3 files changed, 483 insertions(+), 10 deletions(-) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index 71dc7991fdb..178fc6bbfa5 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -121,6 +121,11 @@ grant codeBase "file:${gradle.lib.dir}${/}-" { permission java.security.AllPermission; }; +// nocommit - hack to run test on indices stored elsewhere +grant { + permission java.security.AllPermission; +}; + grant codeBase "file:${gradle.worker.jar}" { permission java.security.AllPermission; }; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index d3187a0896e..eda88ba5472 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -20,15 +20,24 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.TreeMap; +import java.util.stream.Collectors; + import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; @@ -44,6 +53,7 @@ import org.apache.lucene.search.DisjunctionDISIApproximation; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.ImpactsDISI; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Matches; @@ -61,6 +71,7 @@ import org.apache.lucene.search.similarities.SimilarityBase; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.SmallFloat; @@ -94,8 +105,17 @@ * @lucene.experimental */ public final class CombinedFieldQuery extends Query implements Accountable { + /** Cache of decoded 
norms. */ + private static final float[] LENGTH_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); + } + } + private static final long BASE_RAM_BYTES = - RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); + RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); /** A builder for {@link CombinedFieldQuery}. */ public static class Builder { @@ -319,16 +339,19 @@ class CombinedFieldWeight extends Weight { private final IndexSearcher searcher; private final TermStates[] termStates; private final Similarity.SimScorer simWeight; + private final ScoreMode scoreMode; CombinedFieldWeight(Query query, IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { super(query); assert scoreMode.needsScores(); + this.scoreMode = scoreMode; this.searcher = searcher; long docFreq = 0; long totalTermFreq = 0; termStates = new TermStates[fieldTerms.length]; - for (int i = 0; i < termStates.length; i++) { + + for (int i = 0; i < fieldTerms.length; i++) { FieldAndWeight field = fieldAndWeights.get(fieldTerms[i].field()); TermStates ts = TermStates.build(searcher.getTopReaderContext(), fieldTerms[i], true); termStates[i] = ts; @@ -356,6 +379,7 @@ private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher) long docCount = 0; long sumTotalTermFreq = 0; long sumDocFreq = 0; + for (FieldAndWeight fieldWeight : fieldAndWeights.values()) { CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field); if (collectionStats != null) { @@ -402,14 +426,25 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); + Map> fieldImpacts = new HashMap<>(); + for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { - TermsEnum termsEnum = context.reader().terms(fieldTerms[i].field()).iterator(); + String fieldName = fieldTerms[i].field(); + fields.add(fieldAndWeights.get(fieldName)); + fieldImpacts.putIfAbsent(fieldName, new ArrayList<>()); + + TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); - PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); - iterators.add(postingsEnum); - fields.add(fieldAndWeights.get(fieldTerms[i].field())); + if (scoreMode == ScoreMode.TOP_SCORES) { + ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); + iterators.add(impactsEnum); + fieldImpacts.get(fieldName).add(impactsEnum); + } else { + PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); + iterators.add(postingsEnum); + } } } @@ -421,18 +456,33 @@ public Scorer scorer(LeafReaderContext context) throws IOException { new MultiNormsLeafSimScorer(simWeight, context.reader(), fields, true); LeafSimScorer nonScoringSimScorer = new LeafSimScorer(simWeight, context.reader(), "pseudo_field", false); + // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); + Map fieldWeights = new HashMap<>(); for (int i = 0; i < iterators.size(); i++) { - float weight = fields.get(i).weight; + FieldAndWeight fieldAndWeight = fields.get(i); + if (fieldWeights.containsKey(fieldAndWeight.field)) { + assert fieldWeights.get(fieldAndWeight.field).equals(fieldAndWeight.weight); + } else { + 
fieldWeights.put(fieldAndWeight.field, fieldAndWeight.weight); + } queue.add( new WeightedDisiWrapper( - new TermScorer(this, iterators.get(i), nonScoringSimScorer), weight)); + new TermScorer(this, iterators.get(i), nonScoringSimScorer), fieldAndWeight.weight)); } + // Even though it is called approximation, it is accurate since none of // the sub iterators are two-phase iterators. DocIdSetIterator iterator = new DisjunctionDISIApproximation(queue); - return new CombinedFieldScorer(this, queue, iterator, scoringSimScorer); + ImpactsDISI impactsDisi = null; + + if (scoreMode == ScoreMode.TOP_SCORES) { + ImpactsSource impactsSource = mergeImpacts(fieldImpacts, fieldWeights); + iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); + } + + return new CombinedFieldScorer(this, queue, iterator, impactsDisi, scoringSimScorer); } @Override @@ -441,6 +491,318 @@ public boolean isCacheable(LeafReaderContext ctx) { } } + /** Merge impacts for combined field. */ + static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map fieldWeights) { + return new ImpactsSource() { + + class SubIterator { + final Iterator iterator; + int previousFreq; + Impact current; + + SubIterator(Iterator iterator) { + this.iterator = iterator; + this.current = iterator.next(); + } + + void next() { + previousFreq = current.freq; + if (iterator.hasNext() == false) { + current = null; + } else { + current = iterator.next(); + } + } + } + + Map> fieldsWithImpacts; + + @Override + public Impacts getImpacts() throws IOException { + fieldsWithImpacts = new HashMap<>(); + + // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for each field + // They collectively will decide on the number of levels and the block boundaries. + Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.keySet().size()); + + for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { + String field = fieldImpacts.getKey(); + List impactsEnums = fieldImpacts.getValue(); + fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); + + Impacts tmpLead = null; + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); + + if (tmpLead == null || + impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts; + } + } + + leadingImpactsPerField.put(field, tmpLead); + } + + return new Impacts() { + + @Override + public int numLevels() { + // max of levels across fields' impactEnums + return leadingImpactsPerField.values().stream().map(Impacts::numLevels).max(Integer::compareTo).get(); + } + + @Override + public int getDocIdUpTo(int level) { + // min of docIdUpTo across fields' impactEnums + return leadingImpactsPerField.values().stream().filter(i -> i.numLevels() > level).map(i -> i.getDocIdUpTo(level)).min(Integer::compareTo).get(); + } + + @Override + public List getImpacts(int level) { + final int docIdUpTo = getDocIdUpTo(level); + final Map> mergedImpactsPerField = mergeImpactsPerField(docIdUpTo); + + return mergeImpactsAcrossFields(mergedImpactsPerField); + } + + private Map> mergeImpactsPerField(int docIdUpTo) { + final Map> result = new HashMap<>(); + + for (Map.Entry> impactsPerField : fieldsWithImpactsEnums.entrySet()) { + String field = impactsPerField.getKey(); + List impactsEnums = impactsPerField.getValue(); + List impacts = fieldsWithImpacts.get(field); + + List mergedImpacts = 
doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); + + if (mergedImpacts.size() == 0) { + // all impactEnums for this field were positioned beyond docIdUpTo, continue to next field + continue; + } else if (mergedImpacts.size() == 1 && mergedImpacts.get(0).freq == Integer.MAX_VALUE && mergedImpacts.get(0).norm == 1L) { + // one field gets impacts that trigger maximum score, pass it up + return Collections.singletonMap(field, mergedImpacts); + } else { + result.put(field, mergedImpacts); + } + } + + return result; + } + + + // Merge impacts from same field by summing freqs with the same norms - the same logic used for SynonymQuery + private List doMergeImpactsPerField(String field, List impactsEnums, List impacts, int docIdUpTo) { + List> toMerge = new ArrayList<>(); + + for (int i = 0; i < impactsEnums.size(); ++i) { + if (impactsEnums.get(i).docID() <= docIdUpTo) { + int impactsLevel = getLevel(impacts.get(i), docIdUpTo); + if (impactsLevel == -1) { + // one instance doesn't have impacts that cover up to docIdUpTo + // return impacts that trigger the maximum score + return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + } + final List impactList; + float weight = fieldWeights.get(field); + if (weight != 1f) { + impactList = + impacts.get(i).getImpacts(impactsLevel).stream() + .map( + impact -> + new Impact((int) Math.ceil(impact.freq * weight), + SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))) + .collect(Collectors.toList()); + } else { + impactList = impacts.get(i).getImpacts(impactsLevel); + } + toMerge.add(impactList); + } + } + + // all impactEnums for this field were positioned beyond docIdUpTo + if (toMerge.size() == 0) { + return new ArrayList<>(); + } + + if (toMerge.size() == 1) { + // common if one term is common and the other one is rare + return toMerge.get(0); + } + + PriorityQueue pq = + new PriorityQueue(impacts.size()) { + @Override + protected boolean lessThan(SubIterator a, SubIterator b) { + if (a.current == null) { // means iteration is finished + return false; + } + if (b.current == null) { + return true; + } + return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; + } + }; + + for (List toMergeImpacts : toMerge) { + pq.add(new SubIterator(toMergeImpacts.iterator())); + } + + List mergedImpacts = new LinkedList<>(); + + // Idea: merge impacts by norm. The tricky thing is that we need to + // consider norm values that are not in the impacts too. For + // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], + // there might well be a document that has a freq of 2 and a length of 11, + // which was just not added to the list of impacts because {freq=2,norm=10} + // is more competitive. So the way it works is that we track the sum of + // the term freqs that we have seen so far in order to account for these + // implicit impacts. 
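As an aside, the per-field merge described in the comment above can be sketched on its own with hypothetical freq/norm pairs. A local record stands in for org.apache.lucene.index.Impact, and the sketch rescans the lists at each norm boundary for clarity, whereas the real code streams impacts from a priority queue and compares norms unsigned:

import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

class MergeByNormSketch {
  /** Stand-in for org.apache.lucene.index.Impact: freq upper bound for docs whose norm is at most norm. */
  record Imp(int freq, long norm) {}

  // For each norm boundary, the merged freq upper bound is the sum, over the field's
  // term impact lists, of the highest freq whose norm is <= that boundary, so that
  // "implicit" impacts below the boundary are covered by the running maximum.
  static List<Imp> mergePerField(List<List<Imp>> perTermImpacts) {
    TreeSet<Long> norms = new TreeSet<>();
    for (List<Imp> impacts : perTermImpacts) {
      for (Imp impact : impacts) {
        norms.add(impact.norm());
      }
    }
    List<Imp> merged = new ArrayList<>();
    for (long norm : norms) {
      long sumTf = 0;
      for (List<Imp> impacts : perTermImpacts) {
        int best = 0;
        for (Imp impact : impacts) {
          if (impact.norm() <= norm) { // signed comparison for brevity
            best = Math.max(best, impact.freq());
          }
        }
        sumTf += best;
      }
      int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
      if (merged.isEmpty() || freqUpperBound > merged.get(merged.size() - 1).freq()) {
        merged.add(new Imp(freqUpperBound, norm));
      }
    }
    return merged;
  }

  public static void main(String[] args) {
    List<Imp> term1 = List.of(new Imp(2, 10), new Imp(4, 12));
    List<Imp> term2 = List.of(new Imp(1, 11));
    // prints [Imp[freq=2, norm=10], Imp[freq=3, norm=11], Imp[freq=5, norm=12]]
    System.out.println(mergePerField(List.of(term1, term2)));
  }
}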
+ + long sumTf = 0; + SubIterator top = pq.top(); + do { + final long norm = top.current.norm; + do { + sumTf += top.current.freq - top.previousFreq; + top.next(); + top = pq.updateTop(); + } while (top.current != null && top.current.norm == norm); + + final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); + if (mergedImpacts.isEmpty()) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } else { + Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); + assert Long.compareUnsigned(prevImpact.norm, norm) < 0; + if (freqUpperBound > prevImpact.freq) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } // otherwise the previous impact is already more competitive + } + } while (top.current != null); + + return mergedImpacts; + } + + private int getLevel(Impacts impacts, int docIdUpTo) { + for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { + if (impacts.getDocIdUpTo(level) >= docIdUpTo) { + return level; + } + } + return -1; + } + + private List mergeImpactsAcrossFields(Map> mergedImpactsPerField) { + if (mergedImpactsPerField.size() == 1) { + return mergedImpactsPerField.values().iterator().next(); + } + + List> impactLists = new ArrayList<>(mergedImpactsPerField.size()); + for (List impacts : mergedImpactsPerField.values()) { + // add empty impact for cartesian product calculation + List newList = new ArrayList<>(impacts.size() + 1); + newList.addAll(impacts); + newList.add(new Impact(0, 0)); + impactLists.add(newList); + } + + List result = getCartesianProductOfImpacts(impactLists, 0); + + Collections.sort(result, new Comparator() { + @Override + public int compare(Impact o1, Impact o2) { + return (int) (o1.norm - o2.norm); + } + }); + + // remove the dummy Impact {0, 0} added above + result.remove(0); + + return result; + } + + // merge impacts across fields by doing cartesian products recursively, and only keep competitive ones. + // this way of merging impacts across fields mirror the scoring logic in MultiNormsLeafSimScorer#score, where norms were summed along with freqs + // this operation would be expensive for many fields with long impact lists + // + // e.g. + // 1. field a impacts: + // 2. field b impacts: + // 3. cartesian products of above: + // + // + // + // + // 4. keep only the competitive ones from #3 + // 5. 
after step #4, treat the result as competitive impacts for merged fields a & b, and merge it with the rest of fields' impacts + + private List getCartesianProductOfImpacts(List> values, int index) { + if (index == values.size() - 1) { + return values.get(values.size() - 1); + } + + List firstImpactList = values.get(index); + + List cartesianProductOfRestOfImpacts = getCartesianProductOfImpacts(values, index + 1); + + Map maxFreq = new HashMap<>(); + + for (int i = 0; i < firstImpactList.size(); i++) { + for (int j = 0; j < cartesianProductOfRestOfImpacts.size(); j++) { + Impact impactA = firstImpactList.get(i); + Impact impactB = cartesianProductOfRestOfImpacts.get(j); + + int tempNorm = (int) Math.floor(normToLength(impactA.norm) + normToLength(impactB.norm)); + long normSum = SmallFloat.intToByte4(tempNorm); + + long freqSum = (long) impactA.freq + (long) impactB.freq; + int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, freqSum); + + if (maxFreq.containsKey(normSum) == false || + (maxFreq.containsKey(normSum) && freqUpperBound > maxFreq.get(normSum))) { + maxFreq.put(normSum, freqUpperBound); + } + } + } + + // remove duplicate entries where freq is same, but norm larger +// Map reversedMaxFreq = new HashMap<>(); +// for (Map.Entry normFreqPair : maxFreq.entrySet()) { +// Long norm = normFreqPair.getKey(); +// Integer freq = normFreqPair.getValue(); +// +// if (reversedMaxFreq.containsKey(freq) == false || +// (reversedMaxFreq.containsKey(freq) && norm < reversedMaxFreq.get(freq))) { +// reversedMaxFreq.put(freq, norm); +// } +// } + + return maxFreq.entrySet().stream().map(e -> new Impact(e.getValue(), e.getKey())).collect(Collectors.toList()); + } + + }; + } + + private float normToLength(long norm) { + return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; + } + + @Override + public void advanceShallow(int target) throws IOException { + for (List impactsEnums: fieldsWithImpactsEnums.values()) { + for (ImpactsEnum impactsEnum : impactsEnums) { + if (impactsEnum.docID() < target) { + impactsEnum.advanceShallow(target); + } + } + } + } + }; + } + + private static class WeightedDisiWrapper extends DisiWrapper { final float weight; @@ -458,15 +820,18 @@ private static class CombinedFieldScorer extends Scorer { private final DisiPriorityQueue queue; private final DocIdSetIterator iterator; private final MultiNormsLeafSimScorer simScorer; + private final ImpactsDISI impactsDISI; CombinedFieldScorer( Weight weight, DisiPriorityQueue queue, DocIdSetIterator iterator, + ImpactsDISI impactsDISI, MultiNormsLeafSimScorer simScorer) { super(weight); this.queue = queue; this.iterator = iterator; + this.impactsDISI = impactsDISI; this.simScorer = simScorer; } @@ -499,7 +864,29 @@ public DocIdSetIterator iterator() { @Override public float getMaxScore(int upTo) throws IOException { - return Float.POSITIVE_INFINITY; + if (impactsDISI != null) { + return impactsDISI.getMaxScore(upTo); + } else { + return Float.POSITIVE_INFINITY; + } + } + + @Override + public int advanceShallow(int target) throws IOException { + if (impactsDISI != null) { + return impactsDISI.advanceShallow(target); + } else { + return super.advanceShallow(target); + } + } + + @Override + public void setMinCompetitiveScore(float minScore) throws IOException { + if (impactsDISI != null) { + impactsDISI.setMinCompetitiveScore(minScore); + } else { + super.setMinCompetitiveScore(minScore); + } } } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index 7798a2a7017..4d5d3ac0b19 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -17,12 +17,18 @@ package org.apache.lucene.sandbox.search; import com.carrotsearch.randomizedtesting.generators.RandomPicks; + import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Arrays; +import java.util.concurrent.TimeUnit; + import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -46,6 +52,7 @@ import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -165,6 +172,80 @@ public void testSameScore() throws IOException { dir.close(); } + public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOException { + Path path = Paths.get("/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); + + Directory index = FSDirectory.open(path); + IndexReader reader = DirectoryReader.open(index); + IndexSearcher searcher = new IndexSearcher(reader); + + // HighHigh: but publisher # freq=1456553 freq=1289029 +// CombinedFieldQuery query = +// new CombinedFieldQuery.Builder() +// .addField("title", (float) 4.0) +// .addField("body", (float) 2.0) +// .addTerm(new BytesRef("but")) +// .addTerm(new BytesRef("publisher")) +// .build(); + + // HighMed: from office # freq=3224339 freq=225338 +// CombinedFieldQuery query = +// new CombinedFieldQuery.Builder() +// .addField("title", (float) 4.0) +// .addField("body", (float) 2.0) +// .addTerm(new BytesRef("from")) +// .addTerm(new BytesRef("office")) +// .build(); + + // HighLow: with fung # freq=3709421 freq=1344 +// CombinedFieldQuery query = +// new CombinedFieldQuery.Builder() +// .addField("title", (float) 4.0) +// .addField("body", (float) 2.0) +// .addTerm(new BytesRef("with")) +// .addTerm(new BytesRef("fung")) +// .build(); + + // HighLow: date insult # freq=2020626 freq=4424 + CombinedFieldQuery query = + new CombinedFieldQuery.Builder() + .addField("title", (float) 4.0) + .addField("body", (float) 2.0) + .addTerm(new BytesRef("date")) + .addTerm(new BytesRef("insult")) + .build(); + + long start; + long end; + long duration; + int runCount = 20; + + TopScoreDocCollector topScoresCollector = null; + start = System.nanoTime(); + for (int i = 0 ; i < runCount ; i++) { + topScoresCollector = TopScoreDocCollector.create(100, null, 100); // TOP_SCORES + searcher.search(query, topScoresCollector); + } + end = System.nanoTime(); + duration = (end - start) / runCount; + System.out.println("// top scores time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); + + + TopScoreDocCollector completeCollector = 
null; + start = System.nanoTime(); + for (int i = 0 ; i < runCount ; i++) { + completeCollector = TopScoreDocCollector.create(100, null, Integer.MAX_VALUE); // COMPLETE + searcher.search(query, completeCollector); + } + end = System.nanoTime(); + duration = (end - start) / runCount; + System.out.println("// complete time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); + + CheckHits.checkEqual(query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); + + reader.close(); + } + public void testNormsDisabled() throws IOException { Directory dir = newDirectory(); Similarity similarity = randomCompatibleSimilarity(); From 2ba435e5c83f870be95662c951c9818111843a59 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 3 Nov 2021 23:28:25 -0700 Subject: [PATCH 02/16] replace cartesian product calculation with simple upper-bound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Perf tests result Run 1:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                      CFQHighHigh        3.53      (3.3%)        2.92      (5.3%)  -17.2% ( -25% -   -8%) 0.000                         PKLookup      108.13      (7.7%)      119.85      (8.2%)   10.8% (  -4% -   28%) 0.000                       CFQHighLow       14.88      (3.9%)       16.69     (12.5%)   12.2% (  -3% -   29%) 0.000                       CFQHighMed       21.11      (4.1%)       25.87     (11.8%)   22.6% (   6% -   40%) 0.000 Run 2:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                      CFQHighHigh        6.64      (3.1%)        5.63     (10.2%)  -15.2% ( -27% -   -1%) 0.000                       CFQHighLow        8.35      (2.8%)        8.05     (15.0%)   -3.6% ( -20% -   14%) 0.297                       CFQHighMed       24.51      (5.3%)       24.90     (19.9%)    1.6% ( -22% -   28%) 0.733                         PKLookup      110.06     (10.0%)      128.54      (7.9%)   16.8% (  -1% -   38%) 0.000 Run 3:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                       CFQHighMed       13.01      (2.9%)        9.82      (7.8%)  -24.5% ( -34% -  -14%) 0.000                         PKLookup      107.85      (8.1%)      111.79     (11.2%)    3.7% ( -14% -   24%) 0.236                      CFQHighHigh        4.83      (2.6%)        5.06      (8.6%)    4.7% (  -6% -   16%) 0.018                       CFQHighLow       14.95      (3.0%)       18.31     (19.0%)   22.5% (   0% -   45%) 0.000 Run 4:                             TaskQPS baseline      StdDevQPS my_modified_version      StdDev                Pct diff p-value                       CFQHighMed       11.11      (2.9%)        6.69      (4.1%)  -39.7% ( -45% -  -33%) 0.000                       CFQHighLow       27.55      (3.8%)       25.46     (11.0%)   -7.6% ( -21% -    7%) 0.003                      CFQHighHigh        5.25      (3.2%)        4.96      (6.1%)   -5.7% ( -14% -    3%) 0.000                         PKLookup      107.61      (6.7%)      121.19      (4.6%)   12.6% (   1% -   25%) 0.000 --- .../sandbox/search/CombinedFieldQuery.java | 88 ++----------------- 1 file changed, 9 insertions(+), 79 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index eda88ba5472..144f060329a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -699,88 +698,20 @@ private List mergeImpactsAcrossFields(Map> mergedIm return mergedImpactsPerField.values().iterator().next(); } - List> impactLists = new ArrayList<>(mergedImpactsPerField.size()); + // upper-bound by creating an impact that should be most competitive: + // this is done to avoid the potential combinatorial explosion from accurate computation on merged impacts across fields + int maxFreq = 0; + long minNorm = Long.MIN_VALUE; for (List impacts : mergedImpactsPerField.values()) { - // add empty impact for cartesian product calculation - List newList = new ArrayList<>(impacts.size() + 1); - newList.addAll(impacts); - newList.add(new Impact(0, 0)); - impactLists.add(newList); + // highest freq at the end of each impact list + maxFreq = Math.max(maxFreq, impacts.get(impacts.size() - 1).freq); + // lowest norm at the start of each impact list + minNorm = Math.min(minNorm, impacts.get(0).norm); } - List result = getCartesianProductOfImpacts(impactLists, 0); - - Collections.sort(result, new Comparator() { - @Override - public int compare(Impact o1, Impact o2) { - return (int) (o1.norm - o2.norm); - } - }); - - // remove the dummy Impact {0, 0} added above - result.remove(0); - - return result; + return Collections.singletonList(new Impact(maxFreq * mergedImpactsPerField.size(), minNorm)); } - // merge impacts across fields by doing cartesian products recursively, and only keep competitive ones. - // this way of merging impacts across fields mirror the scoring logic in MultiNormsLeafSimScorer#score, where norms were summed along with freqs - // this operation would be expensive for many fields with long impact lists - // - // e.g. - // 1. field a impacts: - // 2. field b impacts: - // 3. cartesian products of above: - // - // - // - // - // 4. keep only the competitive ones from #3 - // 5. 
after step #4, treat the result as competitive impacts for merged fields a & b, and merge it with the rest of fields' impacts - - private List getCartesianProductOfImpacts(List> values, int index) { - if (index == values.size() - 1) { - return values.get(values.size() - 1); - } - - List firstImpactList = values.get(index); - - List cartesianProductOfRestOfImpacts = getCartesianProductOfImpacts(values, index + 1); - - Map maxFreq = new HashMap<>(); - - for (int i = 0; i < firstImpactList.size(); i++) { - for (int j = 0; j < cartesianProductOfRestOfImpacts.size(); j++) { - Impact impactA = firstImpactList.get(i); - Impact impactB = cartesianProductOfRestOfImpacts.get(j); - - int tempNorm = (int) Math.floor(normToLength(impactA.norm) + normToLength(impactB.norm)); - long normSum = SmallFloat.intToByte4(tempNorm); - - long freqSum = (long) impactA.freq + (long) impactB.freq; - int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, freqSum); - - if (maxFreq.containsKey(normSum) == false || - (maxFreq.containsKey(normSum) && freqUpperBound > maxFreq.get(normSum))) { - maxFreq.put(normSum, freqUpperBound); - } - } - } - - // remove duplicate entries where freq is same, but norm larger -// Map reversedMaxFreq = new HashMap<>(); -// for (Map.Entry normFreqPair : maxFreq.entrySet()) { -// Long norm = normFreqPair.getKey(); -// Integer freq = normFreqPair.getValue(); -// -// if (reversedMaxFreq.containsKey(freq) == false || -// (reversedMaxFreq.containsKey(freq) && norm < reversedMaxFreq.get(freq))) { -// reversedMaxFreq.put(freq, norm); -// } -// } - - return maxFreq.entrySet().stream().map(e -> new Impact(e.getValue(), e.getKey())).collect(Collectors.toList()); - } }; } @@ -802,7 +733,6 @@ public void advanceShallow(int target) throws IOException { }; } - private static class WeightedDisiWrapper extends DisiWrapper { final float weight; From 1a71469bf7afa39a052dcb7cd8965ac6b5a96804 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 4 Nov 2021 21:12:31 -0700 Subject: [PATCH 03/16] Bug fix and optimize per CPU profiler Run 1 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHighHigh 4.64 (6.5%) 3.30 (4.7%) -29.0% ( -37% - -19%) 0.000 CFQHighHigh 11.09 (6.0%) 9.61 (6.0%) -13.3% ( -23% - -1%) 0.000 PKLookup 103.38 (4.4%) 108.04 (4.3%) 4.5% ( -4% - 13%) 0.001 CFQHighMedLow 10.58 (6.1%) 12.30 (8.7%) 16.2% ( 1% - 33%) 0.000 CFQHighMed 10.70 (7.4%) 15.51 (11.2%) 44.9% ( 24% - 68%) 0.000 CFQHighLowLow 8.18 (8.2%) 12.87 (11.6%) 57.3% ( 34% - 84%) 0.000 CFQHighLow 14.57 (7.5%) 30.81 (15.1%) 111.4% ( 82% - 144%) 0.000 Run 2 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHighHigh 5.33 (5.7%) 4.02 (7.7%) -24.4% ( -35% - -11%) 0.000 CFQHighLowLow 17.14 (6.2%) 13.06 (5.4%) -23.8% ( -33% - -13%) 0.000 CFQHighMed 17.37 (5.8%) 14.38 (7.7%) -17.2% ( -29% - -3%) 0.000 PKLookup 103.57 (5.5%) 108.84 (5.9%) 5.1% ( -6% - 17%) 0.005 CFQHighMedLow 11.25 (7.2%) 12.70 (9.0%) 12.9% ( -3% - 31%) 0.000 CFQHighHigh 5.00 (6.2%) 7.54 (12.1%) 51.0% ( 30% - 73%) 0.000 CFQHighLow 21.60 (5.2%) 34.57 (14.1%) 60.0% ( 38% - 83%) 0.000 Run 3 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHighHigh 5.40 (6.9%) 4.06 (5.1%) -24.8% ( -34% - -13%) 0.000 CFQHighMedLow 7.64 (7.4%) 5.79 (6.3%) -24.2% ( -35% - -11%) 0.000 CFQHighHigh 11.11 (7.0%) 9.60 (5.9%) -13.6% ( -24% - 0%) 0.000 CFQHighLowLow 21.21 (7.6%) 21.22 (6.6%) 0.0% ( -13% - 15%) 0.993 PKLookup 103.15 (5.9%) 107.60 (6.9%) 4.3% ( -8% - 18%) 0.034 CFQHighLow 21.85 (8.1%) 34.18 (13.5%) 56.4% ( 32% - 
84%) 0.000 CFQHighMed 12.07 (8.4%) 19.98 (16.7%) 65.5% ( 37% - 98%) 0.000 Run 4 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHigh 8.50 (5.8%) 6.85 (5.2%) -19.5% ( -28% - -8%) 0.000 CFQHighMedLow 10.89 (5.7%) 8.96 (5.4%) -17.8% ( -27% - -7%) 0.000 CFQHighMed 8.41 (5.8%) 7.74 (5.6%) -7.9% ( -18% - 3%) 0.000 CFQHighHighHigh 3.45 (6.7%) 3.38 (5.3%) -2.0% ( -13% - 10%) 0.287 CFQHighLowLow 7.82 (6.4%) 8.20 (7.5%) 4.8% ( -8% - 20%) 0.030 PKLookup 103.50 (5.0%) 110.69 (5.4%) 6.9% ( -3% - 18%) 0.000 CFQHighLow 11.46 (6.0%) 13.16 (6.7%) 14.8% ( 1% - 29%) 0.000 --- .../sandbox/search/CombinedFieldQuery.java | 68 ++++++++++++------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 144f060329a..bfd97da2ba2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -29,7 +29,6 @@ import java.util.Objects; import java.util.Set; import java.util.TreeMap; -import java.util.stream.Collectors; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -425,21 +424,23 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); - Map> fieldImpacts = new HashMap<>(); + Map> fieldImpactsEnum = new HashMap<>(); + Map> fieldImpacts = new HashMap<>(); for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); - fieldImpacts.putIfAbsent(fieldName, new ArrayList<>()); + fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); + if (scoreMode == ScoreMode.TOP_SCORES) { ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); - fieldImpacts.get(fieldName).add(impactsEnum); + fieldImpactsEnum.get(fieldName).add(impactsEnum); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -477,7 +478,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsDISI impactsDisi = null; if (scoreMode == ScoreMode.TOP_SCORES) { - ImpactsSource impactsSource = mergeImpacts(fieldImpacts, fieldWeights); + ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -491,7 +492,7 @@ public boolean isCacheable(LeafReaderContext ctx) { } /** Merge impacts for combined field. 
*/ - static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map fieldWeights) { + static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, Map fieldWeights) { return new ImpactsSource() { class SubIterator { @@ -514,15 +515,12 @@ void next() { } } - Map> fieldsWithImpacts; @Override public Impacts getImpacts() throws IOException { - fieldsWithImpacts = new HashMap<>(); - // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for each field // They collectively will decide on the number of levels and the block boundaries. - Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.keySet().size()); + Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { String field = fieldImpacts.getKey(); @@ -549,13 +547,27 @@ public Impacts getImpacts() throws IOException { @Override public int numLevels() { // max of levels across fields' impactEnums - return leadingImpactsPerField.values().stream().map(Impacts::numLevels).max(Integer::compareTo).get(); + int result = 0; + + for (Impacts impacts : leadingImpactsPerField.values()) { + result = Math.max(result, impacts.numLevels()); + } + + return result; } @Override public int getDocIdUpTo(int level) { // min of docIdUpTo across fields' impactEnums - return leadingImpactsPerField.values().stream().filter(i -> i.numLevels() > level).map(i -> i.getDocIdUpTo(level)).min(Integer::compareTo).get(); + int result = Integer.MAX_VALUE; + + for (Impacts impacts : leadingImpactsPerField.values()) { + if (impacts.numLevels() > level) { + result = Math.min(result, impacts.getDocIdUpTo(level)); + } + } + + return result; } @Override @@ -567,7 +579,7 @@ public List getImpacts(int level) { } private Map> mergeImpactsPerField(int docIdUpTo) { - final Map> result = new HashMap<>(); + final Map> result = new HashMap<>(fieldsWithImpactsEnums.size()); for (Map.Entry> impactsPerField : fieldsWithImpactsEnums.entrySet()) { String field = impactsPerField.getKey(); @@ -593,7 +605,7 @@ private Map> mergeImpactsPerField(int docIdUpTo) { // Merge impacts from same field by summing freqs with the same norms - the same logic used for SynonymQuery private List doMergeImpactsPerField(String field, List impactsEnums, List impacts, int docIdUpTo) { - List> toMerge = new ArrayList<>(); + List> toMerge = new ArrayList<>(impactsEnums.size()); for (int i = 0; i < impactsEnums.size(); ++i) { if (impactsEnums.get(i).docID() <= docIdUpTo) { @@ -603,20 +615,19 @@ private List doMergeImpactsPerField(String field, List impa // return impacts that trigger the maximum score return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); } - final List impactList; float weight = fieldWeights.get(field); if (weight != 1f) { - impactList = - impacts.get(i).getImpacts(impactsLevel).stream() - .map( - impact -> - new Impact((int) Math.ceil(impact.freq * weight), - SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))) - .collect(Collectors.toList()); + final List originalImpactList = impacts.get(i).getImpacts(impactsLevel); + final List impactList = new ArrayList<>(originalImpactList.size()); + for (Impact impact : originalImpactList) { + impactList.add(new Impact((int) Math.ceil(impact.freq * weight), + SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))); + + } + toMerge.add(impactList); } else { - impactList = impacts.get(i).getImpacts(impactsLevel); + 
toMerge.add(impacts.get(i).getImpacts(impactsLevel)); } - toMerge.add(impactList); } } @@ -709,7 +720,14 @@ private List mergeImpactsAcrossFields(Map> mergedIm minNorm = Math.min(minNorm, impacts.get(0).norm); } - return Collections.singletonList(new Impact(maxFreq * mergedImpactsPerField.size(), minNorm)); + int amplifiedMaxFreq = maxFreq * mergedImpactsPerField.size(); + + // overflow + if (amplifiedMaxFreq < 0) { + amplifiedMaxFreq = Integer.MAX_VALUE; + } + + return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); } From 8ecdc694336c26312be2e3fc12c7b67594541b39 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 4 Nov 2021 21:16:52 -0700 Subject: [PATCH 04/16] Fix style --- .../sandbox/search/CombinedFieldQuery.java | 88 +++++++++++-------- .../search/TestCombinedFieldQuery.java | 78 ++++++++-------- 2 files changed, 92 insertions(+), 74 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index bfd97da2ba2..a1ba083fc1e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -29,7 +29,6 @@ import java.util.Objects; import java.util.Set; import java.util.TreeMap; - import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Impact; @@ -113,7 +112,7 @@ public final class CombinedFieldQuery extends Query implements Accountable { } private static final long BASE_RAM_BYTES = - RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); + RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); /** A builder for {@link CombinedFieldQuery}. */ public static class Builder { @@ -469,7 +468,8 @@ public Scorer scorer(LeafReaderContext context) throws IOException { } queue.add( new WeightedDisiWrapper( - new TermScorer(this, iterators.get(i), nonScoringSimScorer), fieldAndWeight.weight)); + new TermScorer(this, iterators.get(i), nonScoringSimScorer), + fieldAndWeight.weight)); } // Even though it is called approximation, it is accurate since none of @@ -492,7 +492,10 @@ public boolean isCacheable(LeafReaderContext ctx) { } /** Merge impacts for combined field. */ - static ImpactsSource mergeImpacts(Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, Map fieldWeights) { + static ImpactsSource mergeImpacts( + Map> fieldsWithImpactsEnums, + Map> fieldsWithImpacts, + Map fieldWeights) { return new ImpactsSource() { class SubIterator { @@ -515,14 +518,15 @@ void next() { } } - @Override public Impacts getImpacts() throws IOException { - // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for each field + // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for + // each field // They collectively will decide on the number of levels and the block boundaries. 
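A minimal standalone sketch of how those per-field leading impacts combine into the merged skip structure, using hypothetical docIdUpTo boundaries: numLevels is the maximum across fields, and getDocIdUpTo(level) is the minimum across the fields that reach that level:

import java.util.Map;

class MergedLevelsSketch {
  public static void main(String[] args) {
    // hypothetical per-field leading skip boundaries: docIdUpTo for level 0, 1, ...
    Map<String, int[]> leadingBoundaries =
        Map.of("title", new int[] {128, 1024}, "body", new int[] {64, 512, 8192});

    // numLevels: max across fields' leading impacts
    int numLevels =
        leadingBoundaries.values().stream().mapToInt(a -> a.length).max().orElse(0);
    System.out.println("numLevels = " + numLevels); // 3

    // getDocIdUpTo(level): min across the fields that have that level
    for (int level = 0; level < numLevels; level++) {
      final int l = level;
      int docIdUpTo =
          leadingBoundaries.values().stream()
              .filter(a -> a.length > l)
              .mapToInt(a -> a[l])
              .min()
              .orElse(Integer.MAX_VALUE);
      System.out.println("getDocIdUpTo(" + level + ") = " + docIdUpTo); // 64, 512, 8192
    }
  }
}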
Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); - for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { + for (Map.Entry> fieldImpacts : + fieldsWithImpactsEnums.entrySet()) { String field = fieldImpacts.getKey(); List impactsEnums = fieldImpacts.getValue(); fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); @@ -533,8 +537,7 @@ public Impacts getImpacts() throws IOException { Impacts impacts = impactsEnums.get(i).getImpacts(); fieldsWithImpacts.get(field).add(impacts); - if (tmpLead == null || - impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { tmpLead = impacts; } } @@ -550,7 +553,7 @@ public int numLevels() { int result = 0; for (Impacts impacts : leadingImpactsPerField.values()) { - result = Math.max(result, impacts.numLevels()); + result = Math.max(result, impacts.numLevels()); } return result; @@ -581,17 +584,22 @@ public List getImpacts(int level) { private Map> mergeImpactsPerField(int docIdUpTo) { final Map> result = new HashMap<>(fieldsWithImpactsEnums.size()); - for (Map.Entry> impactsPerField : fieldsWithImpactsEnums.entrySet()) { + for (Map.Entry> impactsPerField : + fieldsWithImpactsEnums.entrySet()) { String field = impactsPerField.getKey(); List impactsEnums = impactsPerField.getValue(); List impacts = fieldsWithImpacts.get(field); - List mergedImpacts = doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); + List mergedImpacts = + doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); if (mergedImpacts.size() == 0) { - // all impactEnums for this field were positioned beyond docIdUpTo, continue to next field + // all impactEnums for this field were positioned beyond docIdUpTo, continue to next + // field continue; - } else if (mergedImpacts.size() == 1 && mergedImpacts.get(0).freq == Integer.MAX_VALUE && mergedImpacts.get(0).norm == 1L) { + } else if (mergedImpacts.size() == 1 + && mergedImpacts.get(0).freq == Integer.MAX_VALUE + && mergedImpacts.get(0).norm == 1L) { // one field gets impacts that trigger maximum score, pass it up return Collections.singletonMap(field, mergedImpacts); } else { @@ -602,9 +610,10 @@ private Map> mergeImpactsPerField(int docIdUpTo) { return result; } - - // Merge impacts from same field by summing freqs with the same norms - the same logic used for SynonymQuery - private List doMergeImpactsPerField(String field, List impactsEnums, List impacts, int docIdUpTo) { + // Merge impacts from same field by summing freqs with the same norms - the same logic + // used for SynonymQuery + private List doMergeImpactsPerField( + String field, List impactsEnums, List impacts, int docIdUpTo) { List> toMerge = new ArrayList<>(impactsEnums.size()); for (int i = 0; i < impactsEnums.size(); ++i) { @@ -620,9 +629,11 @@ private List doMergeImpactsPerField(String field, List impa final List originalImpactList = impacts.get(i).getImpacts(impactsLevel); final List impactList = new ArrayList<>(originalImpactList.size()); for (Impact impact : originalImpactList) { - impactList.add(new Impact((int) Math.ceil(impact.freq * weight), - SmallFloat.intToByte4((int) Math.floor(normToLength(impact.norm) * weight)))); - + impactList.add( + new Impact( + (int) Math.ceil(impact.freq * weight), + SmallFloat.intToByte4( + (int) Math.floor(normToLength(impact.norm) * weight)))); } toMerge.add(impactList); } else { @@ -642,18 +653,18 @@ private List doMergeImpactsPerField(String field, List impa } PriorityQueue pq = - new 
PriorityQueue(impacts.size()) { - @Override - protected boolean lessThan(SubIterator a, SubIterator b) { - if (a.current == null) { // means iteration is finished - return false; - } - if (b.current == null) { - return true; - } - return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; - } - }; + new PriorityQueue(impacts.size()) { + @Override + protected boolean lessThan(SubIterator a, SubIterator b) { + if (a.current == null) { // means iteration is finished + return false; + } + if (b.current == null) { + return true; + } + return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; + } + }; for (List toMergeImpacts : toMerge) { pq.add(new SubIterator(toMergeImpacts.iterator())); @@ -704,13 +715,16 @@ private int getLevel(Impacts impacts, int docIdUpTo) { return -1; } - private List mergeImpactsAcrossFields(Map> mergedImpactsPerField) { + private List mergeImpactsAcrossFields( + Map> mergedImpactsPerField) { if (mergedImpactsPerField.size() == 1) { return mergedImpactsPerField.values().iterator().next(); } - // upper-bound by creating an impact that should be most competitive: - // this is done to avoid the potential combinatorial explosion from accurate computation on merged impacts across fields + // upper-bound by creating an impact that should be most competitive: + // this is done to avoid the potential combinatorial explosion from accurate computation + // on merged impacts across fields int maxFreq = 0; long minNorm = Long.MIN_VALUE; for (List impacts : mergedImpactsPerField.values()) { @@ -729,8 +743,6 @@ private List mergeImpactsAcrossFields(Map> mergedIm return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); } - - }; } @@ -740,7 +752,7 @@ private float normToLength(long norm) { @Override public void advanceShallow(int target) throws IOException { - for (List impactsEnums: fieldsWithImpactsEnums.values()) { + for (List impactsEnums : fieldsWithImpactsEnums.values()) { for (ImpactsEnum impactsEnum : impactsEnums) { if (impactsEnum.docID() < target) { impactsEnum.advanceShallow(target); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index 4d5d3ac0b19..d464765489b 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -17,13 +17,11 @@ package org.apache.lucene.sandbox.search; import com.carrotsearch.randomizedtesting.generators.RandomPicks; - import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.concurrent.TimeUnit; - import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; @@ -173,47 +171,49 @@ public void testSameScore() throws IOException { } public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOException { - Path path = Paths.get("/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); + Path path = + Paths.get( + "/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); Directory index = FSDirectory.open(path); 
IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); // HighHigh: but publisher # freq=1456553 freq=1289029 -// CombinedFieldQuery query = -// new CombinedFieldQuery.Builder() -// .addField("title", (float) 4.0) -// .addField("body", (float) 2.0) -// .addTerm(new BytesRef("but")) -// .addTerm(new BytesRef("publisher")) -// .build(); + // CombinedFieldQuery query = + // new CombinedFieldQuery.Builder() + // .addField("title", (float) 4.0) + // .addField("body", (float) 2.0) + // .addTerm(new BytesRef("but")) + // .addTerm(new BytesRef("publisher")) + // .build(); // HighMed: from office # freq=3224339 freq=225338 -// CombinedFieldQuery query = -// new CombinedFieldQuery.Builder() -// .addField("title", (float) 4.0) -// .addField("body", (float) 2.0) -// .addTerm(new BytesRef("from")) -// .addTerm(new BytesRef("office")) -// .build(); + // CombinedFieldQuery query = + // new CombinedFieldQuery.Builder() + // .addField("title", (float) 4.0) + // .addField("body", (float) 2.0) + // .addTerm(new BytesRef("from")) + // .addTerm(new BytesRef("office")) + // .build(); // HighLow: with fung # freq=3709421 freq=1344 -// CombinedFieldQuery query = -// new CombinedFieldQuery.Builder() -// .addField("title", (float) 4.0) -// .addField("body", (float) 2.0) -// .addTerm(new BytesRef("with")) -// .addTerm(new BytesRef("fung")) -// .build(); + // CombinedFieldQuery query = + // new CombinedFieldQuery.Builder() + // .addField("title", (float) 4.0) + // .addField("body", (float) 2.0) + // .addTerm(new BytesRef("with")) + // .addTerm(new BytesRef("fung")) + // .build(); // HighLow: date insult # freq=2020626 freq=4424 CombinedFieldQuery query = - new CombinedFieldQuery.Builder() - .addField("title", (float) 4.0) - .addField("body", (float) 2.0) - .addTerm(new BytesRef("date")) - .addTerm(new BytesRef("insult")) - .build(); + new CombinedFieldQuery.Builder() + .addField("title", (float) 4.0) + .addField("body", (float) 2.0) + .addTerm(new BytesRef("date")) + .addTerm(new BytesRef("insult")) + .build(); long start; long end; @@ -222,26 +222,32 @@ public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOExc TopScoreDocCollector topScoresCollector = null; start = System.nanoTime(); - for (int i = 0 ; i < runCount ; i++) { + for (int i = 0; i < runCount; i++) { topScoresCollector = TopScoreDocCollector.create(100, null, 100); // TOP_SCORES searcher.search(query, topScoresCollector); } end = System.nanoTime(); duration = (end - start) / runCount; - System.out.println("// top scores time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); - + System.out.println( + "// top scores time usage\t" + + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + + " milliseconds"); TopScoreDocCollector completeCollector = null; start = System.nanoTime(); - for (int i = 0 ; i < runCount ; i++) { + for (int i = 0; i < runCount; i++) { completeCollector = TopScoreDocCollector.create(100, null, Integer.MAX_VALUE); // COMPLETE searcher.search(query, completeCollector); } end = System.nanoTime(); duration = (end - start) / runCount; - System.out.println("// complete time usage\t" + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + " milliseconds"); + System.out.println( + "// complete time usage\t" + + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) + + " milliseconds"); - CheckHits.checkEqual(query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); + 
CheckHits.checkEqual( + query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); reader.close(); } From b4ac2b488cdb26b63b79bd7e792df4deb10c8d3f Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sat, 6 Nov 2021 11:15:35 -0700 Subject: [PATCH 05/16] Update test --- .../randomization/policies/tests.policy | 5 - .../search/TestCombinedFieldQuery.java | 172 ++++++++++-------- 2 files changed, 100 insertions(+), 77 deletions(-) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index 178fc6bbfa5..71dc7991fdb 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -121,11 +121,6 @@ grant codeBase "file:${gradle.lib.dir}${/}-" { permission java.security.AllPermission; }; -// nocommit - hack to run test on indices stored elsewhere -grant { - permission java.security.AllPermission; -}; - grant codeBase "file:${gradle.worker.jar}" { permission java.security.AllPermission; }; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java index d464765489b..6f72169aa7c 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java @@ -16,17 +16,17 @@ */ package org.apache.lucene.sandbox.search; +import static com.carrotsearch.randomizedtesting.RandomizedTest.atMost; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; + import com.carrotsearch.randomizedtesting.generators.RandomPicks; import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Arrays; -import java.util.concurrent.TimeUnit; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; -import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -50,7 +50,6 @@ import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -171,85 +170,114 @@ public void testSameScore() throws IOException { } public void testSameScoreAndCollectionBetweenCompleteAndTopScores() throws IOException { - Path path = - Paths.get( - "/Users/xichen/IdeaProjects/benchmarks/indices/wikimedium10m.lucene_baseline.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.sortedset:Month.sortedset:DayOfYear.Lucene90.Lucene90.nd10M/index"); + int numDocs = + randomBoolean() + ? 
atLeast(1000) + : atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data + int numMatchDoc = randomIntBetween(200, 500); + int numHits = atMost(100); + int boost1 = Math.max(1, random().nextInt(5)); + int boost2 = Math.max(1, random().nextInt(5)); - Directory index = FSDirectory.open(path); - IndexReader reader = DirectoryReader.open(index); - IndexSearcher searcher = new IndexSearcher(reader); + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + // adding non-matching docs + for (int i = 0; i < numDocs - numMatchDoc; ++i) { + Document doc = new Document(); + + int freqA = random().nextInt(50) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "bar" + j, Store.NO)); + } + + int freqB = random().nextInt(50) + 1; + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "bla" + j, Store.NO)); + } + int freqC = random().nextInt(50) + 1; + for (int j = 0; j < freqC; j++) { + doc.add(new TextField("c", "bla" + j, Store.NO)); + } + w.addDocument(doc); + } + + // adding potentially matching doc + for (int i = 0; i < numMatchDoc; i++) { + Document doc = new Document(); + + int freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + } + + freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo" + j, Store.NO)); + } + } + + freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "zoo", Store.NO)); + } + } + + int freqB = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo", Store.NO)); + } + } + + freqB = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo" + j, Store.NO)); + } + } + + int freqC = random().nextInt(20) + 1; + for (int j = 0; j < freqC; j++) { + doc.add(new TextField("c", "bla" + j, Store.NO)); + } + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); - // HighHigh: but publisher # freq=1456553 freq=1289029 - // CombinedFieldQuery query = - // new CombinedFieldQuery.Builder() - // .addField("title", (float) 4.0) - // .addField("body", (float) 2.0) - // .addTerm(new BytesRef("but")) - // .addTerm(new BytesRef("publisher")) - // .build(); - - // HighMed: from office # freq=3224339 freq=225338 - // CombinedFieldQuery query = - // new CombinedFieldQuery.Builder() - // .addField("title", (float) 4.0) - // .addField("body", (float) 2.0) - // .addTerm(new BytesRef("from")) - // .addTerm(new BytesRef("office")) - // .build(); - - // HighLow: with fung # freq=3709421 freq=1344 - // CombinedFieldQuery query = - // new CombinedFieldQuery.Builder() - // .addField("title", (float) 4.0) - // .addField("body", (float) 2.0) - // .addTerm(new BytesRef("with")) - // .addTerm(new BytesRef("fung")) - // .build(); - - // HighLow: date insult # freq=2020626 freq=4424 CombinedFieldQuery query = new CombinedFieldQuery.Builder() - .addField("title", (float) 4.0) - .addField("body", (float) 2.0) - .addTerm(new BytesRef("date")) - .addTerm(new BytesRef("insult")) + .addField("a", (float) boost1) + .addField("b", (float) 
boost2) + .addTerm(new BytesRef("foo")) + .addTerm(new BytesRef("zoo")) .build(); - long start; - long end; - long duration; - int runCount = 20; + TopScoreDocCollector topScoresCollector = + TopScoreDocCollector.create(numHits, null, numHits); // TOP_SCORES + searcher.search(query, topScoresCollector); - TopScoreDocCollector topScoresCollector = null; - start = System.nanoTime(); - for (int i = 0; i < runCount; i++) { - topScoresCollector = TopScoreDocCollector.create(100, null, 100); // TOP_SCORES - searcher.search(query, topScoresCollector); - } - end = System.nanoTime(); - duration = (end - start) / runCount; - System.out.println( - "// top scores time usage\t" - + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) - + " milliseconds"); - - TopScoreDocCollector completeCollector = null; - start = System.nanoTime(); - for (int i = 0; i < runCount; i++) { - completeCollector = TopScoreDocCollector.create(100, null, Integer.MAX_VALUE); // COMPLETE - searcher.search(query, completeCollector); - } - end = System.nanoTime(); - duration = (end - start) / runCount; - System.out.println( - "// complete time usage\t" - + TimeUnit.MILLISECONDS.convert(duration, TimeUnit.NANOSECONDS) - + " milliseconds"); + TopScoreDocCollector completeCollector = + TopScoreDocCollector.create(numHits, null, Integer.MAX_VALUE); // COMPLETE + searcher.search(query, completeCollector); CheckHits.checkEqual( query, completeCollector.topDocs().scoreDocs, topScoresCollector.topDocs().scoreDocs); reader.close(); + w.close(); + dir.close(); } public void testNormsDisabled() throws IOException { From 4244f39d6f63161b04ae7eefe502212bc06f6bea Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sat, 6 Nov 2021 11:40:55 -0700 Subject: [PATCH 06/16] Update amplifiedMaxFreq calculation --- .../lucene/sandbox/search/CombinedFieldQuery.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index a1ba083fc1e..671855f9463 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -734,13 +734,17 @@ private List mergeImpactsAcrossFields( minNorm = Math.min(minNorm, impacts.get(0).norm); } - int amplifiedMaxFreq = maxFreq * mergedImpactsPerField.size(); - - // overflow - if (amplifiedMaxFreq < 0) { - amplifiedMaxFreq = Integer.MAX_VALUE; + int amplifiedMaxFreq = 0; + if (maxFreq == Integer.MAX_VALUE) { + amplifiedMaxFreq = maxFreq; + } else { + amplifiedMaxFreq = + (int) Math.min((long) maxFreq * mergedImpactsPerField.size(), Integer.MAX_VALUE); } + // no overflow should occur + assert amplifiedMaxFreq > 0; + return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); } }; From 81206d44d5bac5d4249bba4b94058a2e8aed0e2d Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sun, 7 Nov 2021 10:19:36 -0800 Subject: [PATCH 07/16] Simplify and reduce maxFreq Run 1 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighMed 16.99 (4.9%) 16.84 (14.9%) -0.9% ( -19% - 19%) 0.790 PKLookup 98.49 (8.7%) 121.99 (9.1%) 23.9% ( 5% - 45%) 0.000 CFQHighHigh 5.82 (4.7%) 7.55 (10.0%) 29.8% ( 14% - 46%) 0.000 CFQHighLow 17.59 (5.2%) 25.01 (17.3%) 42.2% ( 18% - 68%) 0.000 Run 2 TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighMed 18.28 (6.1%) 15.63 (15.6%) -14.5% ( -34% - 7%) 
0.000 PKLookup 101.26 (10.0%) 105.39 (12.0%) 4.1% ( -16% - 28%) 0.243 CFQHighHigh 9.96 (5.6%) 10.47 (13.0%) 5.2% ( -12% - 25%) 0.101 CFQHighLow 9.71 (5.8%) 15.71 (34.0%) 61.8% ( 20% - 107%) 0.000 --- .../sandbox/search/CombinedFieldQuery.java | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 671855f9463..5a266846636 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -423,8 +423,8 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); - Map> fieldImpactsEnum = new HashMap<>(); - Map> fieldImpacts = new HashMap<>(); + Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); + Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); @@ -458,7 +458,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); - Map fieldWeights = new HashMap<>(); + Map fieldWeights = new HashMap<>(fieldImpactsEnum.size()); for (int i = 0; i < iterators.size(); i++) { FieldAndWeight fieldAndWeight = fields.get(i); if (fieldWeights.containsKey(fieldAndWeight.field)) { @@ -721,31 +721,22 @@ private List mergeImpactsAcrossFields( return mergedImpactsPerField.values().iterator().next(); } - // upper-bound by creating an impact that should be most competitive: - // this is done to avoid the potential combinatorial explosion from accurate computation + // upper-bound by creating an impact that should be most competitive: + // this is done to avoid the potential costly combinatorial explosion from accurate + // computation // on merged impacts across fields - int maxFreq = 0; + long maxFreqSum = 0; long minNorm = Long.MIN_VALUE; for (List impacts : mergedImpactsPerField.values()) { // highest freq at the end of each impact list - maxFreq = Math.max(maxFreq, impacts.get(impacts.size() - 1).freq); + maxFreqSum += impacts.get(impacts.size() - 1).freq; // lowest norm at the start of each impact list minNorm = Math.min(minNorm, impacts.get(0).norm); } - int amplifiedMaxFreq = 0; - if (maxFreq == Integer.MAX_VALUE) { - amplifiedMaxFreq = maxFreq; - } else { - amplifiedMaxFreq = - (int) Math.min((long) maxFreq * mergedImpactsPerField.size(), Integer.MAX_VALUE); - } - - // no overflow should occur - assert amplifiedMaxFreq > 0; - - return Collections.singletonList(new Impact(amplifiedMaxFreq, minNorm)); + return Collections.singletonList( + new Impact((int) Math.min(maxFreqSum, Integer.MAX_VALUE), minNorm)); } }; } From 6d5e7808fd847d5849e7c4ff3b552aeb9c196bbb Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 18 Nov 2021 23:49:42 -0800 Subject: [PATCH 08/16] cache leadingImpactsPerField --- .../sandbox/search/CombinedFieldQuery.java | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 
1e69a232764..d151bd917e2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -497,6 +497,7 @@ static ImpactsSource mergeImpacts( Map> fieldsWithImpacts, Map fieldWeights) { return new ImpactsSource() { + Map leadingImpactsPerField = null; class SubIterator { final Iterator iterator; @@ -523,26 +524,28 @@ public Impacts getImpacts() throws IOException { // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for // each field // They collectively will decide on the number of levels and the block boundaries. - Map leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); - - for (Map.Entry> fieldImpacts : - fieldsWithImpactsEnums.entrySet()) { - String field = fieldImpacts.getKey(); - List impactsEnums = fieldImpacts.getValue(); - fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); - - Impacts tmpLead = null; - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); - fieldsWithImpacts.get(field).add(impacts); - - if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts; + + if (leadingImpactsPerField == null) { + leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); + for (Map.Entry> fieldImpacts : + fieldsWithImpactsEnums.entrySet()) { + String field = fieldImpacts.getKey(); + List impactsEnums = fieldImpacts.getValue(); + fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); + + Impacts tmpLead = null; + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); + + if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts; + } } - } - leadingImpactsPerField.put(field, tmpLead); + leadingImpactsPerField.put(field, tmpLead); + } } return new Impacts() { @@ -754,6 +757,7 @@ public void advanceShallow(int target) throws IOException { } } } + leadingImpactsPerField = null; } }; } From db2446f0337e5860bb8c731df1f6c16281efb6bb Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Mon, 22 Nov 2021 01:16:35 -0800 Subject: [PATCH 09/16] test leadImpact from highest weighted field --- .../sandbox/search/CombinedFieldQuery.java | 85 ++++++++++++------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index d151bd917e2..a04316b3457 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -424,6 +424,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); + Map> fieldTermDocFreq = new HashMap<>(fieldAndWeights.size()); Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); for (int i = 0; i < fieldTerms.length; i++) { @@ -432,6 +433,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); 
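          // Only ScoreMode.TOP_SCORES gathers ImpactsEnums into the per-field maps below;
          // mergeImpacts() later uses them to derive block-level score upper bounds, while
          // other score modes keep plain PostingsEnums and skip this bookkeeping.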
fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); + fieldTermDocFreq.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); @@ -440,6 +442,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); fieldImpactsEnum.get(fieldName).add(impactsEnum); + fieldTermDocFreq.get(fieldName).add(termsEnum.docFreq()); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -478,7 +481,8 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsDISI impactsDisi = null; if (scoreMode == ScoreMode.TOP_SCORES) { - ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights); + ImpactsSource impactsSource = + mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldTermDocFreq, fieldWeights); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -495,9 +499,10 @@ public boolean isCacheable(LeafReaderContext ctx) { static ImpactsSource mergeImpacts( Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, + Map> fieldTermDocFreq, Map fieldWeights) { return new ImpactsSource() { - Map leadingImpactsPerField = null; + Impacts leadingImpacts = null; class SubIterator { final Iterator iterator; @@ -525,55 +530,69 @@ public Impacts getImpacts() throws IOException { // each field // They collectively will decide on the number of levels and the block boundaries. - if (leadingImpactsPerField == null) { - leadingImpactsPerField = new HashMap<>(fieldsWithImpactsEnums.size()); + if (leadingImpacts == null) { + float maxWeight = Float.MIN_VALUE; + String maxWeightField = ""; + + for (Map.Entry fieldWeightEntry : fieldWeights.entrySet()) { + String field = fieldWeightEntry.getKey(); + float weight = fieldWeightEntry.getValue(); + + if (maxWeight < weight) { + maxWeight = weight; + maxWeightField = field; + } + } + + Impacts tmpLead = null; for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { String field = fieldImpacts.getKey(); List impactsEnums = fieldImpacts.getValue(); + // List docFreqs = fieldTermDocFreq.get(field); fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); - Impacts tmpLead = null; - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); - fieldsWithImpacts.get(field).add(impacts); - - if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts; + if (field.equals(maxWeightField)) { + int minDocFreq = Integer.MAX_VALUE; + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); + + // use the impact of term within this mostly weighted field that has the least doc + // freq + // this may have the result of getting larger upTo bound, as low doc freq term may + // also have large gap in doc ids + // int docFreq = docFreqs.get(i); + // if (tmpLead == null || docFreq < minDocFreq) { + // minDocFreq = docFreq; + // tmpLead = impacts; + // } + + if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts; + } + } + } else { + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.size(); ++i) { + Impacts impacts = 
impactsEnums.get(i).getImpacts(); + fieldsWithImpacts.get(field).add(impacts); } } - - leadingImpactsPerField.put(field, tmpLead); } + leadingImpacts = tmpLead; } return new Impacts() { @Override public int numLevels() { - // max of levels across fields' impactEnums - int result = 0; - - for (Impacts impacts : leadingImpactsPerField.values()) { - result = Math.max(result, impacts.numLevels()); - } - - return result; + return leadingImpacts.numLevels(); } @Override public int getDocIdUpTo(int level) { - // min of docIdUpTo across fields' impactEnums - int result = Integer.MAX_VALUE; - - for (Impacts impacts : leadingImpactsPerField.values()) { - if (impacts.numLevels() > level) { - result = Math.min(result, impacts.getDocIdUpTo(level)); - } - } - - return result; + return leadingImpacts.getDocIdUpTo(level); } @Override @@ -757,7 +776,7 @@ public void advanceShallow(int target) throws IOException { } } } - leadingImpactsPerField = null; + leadingImpacts = null; } }; } From 3d0a215d0174f13b508560bd476cd68475f05864 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Sun, 28 Nov 2021 18:18:10 -0800 Subject: [PATCH 10/16] Fix bug Results from python3 src/python/localrun.py -source combinedFieldsUnevenlyWeightedBig: Run 1: TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHigh 2.81 (10.9%) 2.40 (10.3%) -14.6% ( -32% - 7%) 0.000 CFQHighMed 3.27 (11.8%) 2.91 (11.0%) -11.2% ( -30% - 13%) 0.002 CFQHighLow 16.09 (14.6%) 14.58 (11.9%) -9.4% ( -31% - 20%) 0.025 PKLookup 92.08 (11.6%) 96.23 (13.3%) 4.5% ( -18% - 33%) 0.255 Run 2: TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighLow 15.76 (12.0%) 10.19 (12.9%) -35.3% ( -53% - -11%) 0.000 CFQHighMed 12.01 (13.3%) 8.79 (13.9%) -26.9% ( -47% - 0%) 0.000 CFQHighHigh 5.59 (10.8%) 4.40 (8.4%) -21.2% ( -36% - -2%) 0.000 PKLookup 93.36 (21.9%) 94.29 (22.6%) 1.0% ( -35% - 58%) 0.888 --- .../org/apache/lucene/sandbox/search/CombinedFieldQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index a04316b3457..dc90476152d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -749,7 +749,7 @@ private List mergeImpactsAcrossFields( // computation // on merged impacts across fields long maxFreqSum = 0; - long minNorm = Long.MIN_VALUE; + long minNorm = Long.MAX_VALUE; for (List impacts : mergedImpactsPerField.values()) { // highest freq at the end of each impact list maxFreqSum += impacts.get(impacts.size() - 1).freq; From 8a7ea9998b189649ccdc75ecdd4b096460abb0af Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 1 Dec 2021 20:34:40 -0800 Subject: [PATCH 11/16] Address feedback - avoid repeated computation for max weighted field, replace doc freq collection with ImpactsEnum#cost --- .../sandbox/search/CombinedFieldQuery.java | 55 +++++++++---------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index dc90476152d..9833a3e1a49 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -424,25 +424,30 @@ public 
Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); - Map> fieldTermDocFreq = new HashMap<>(fieldAndWeights.size()); Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); + float maxWeight = Float.MIN_VALUE; + String maxWeightField = ""; for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); - fieldTermDocFreq.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); if (scoreMode == ScoreMode.TOP_SCORES) { + float weight = fieldAndWeights.get(fieldName).weight; + if (maxWeight < weight) { + maxWeight = weight; + maxWeightField = fieldName; + } + ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); fieldImpactsEnum.get(fieldName).add(impactsEnum); - fieldTermDocFreq.get(fieldName).add(termsEnum.docFreq()); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -482,7 +487,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { if (scoreMode == ScoreMode.TOP_SCORES) { ImpactsSource impactsSource = - mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldTermDocFreq, fieldWeights); + mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights, maxWeightField); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -499,8 +504,8 @@ public boolean isCacheable(LeafReaderContext ctx) { static ImpactsSource mergeImpacts( Map> fieldsWithImpactsEnums, Map> fieldsWithImpacts, - Map> fieldTermDocFreq, - Map fieldWeights) { + Map fieldWeights, + String maxWeightField) { return new ImpactsSource() { Impacts leadingImpacts = null; @@ -531,19 +536,6 @@ public Impacts getImpacts() throws IOException { // They collectively will decide on the number of levels and the block boundaries. 
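        // Note: leadingImpacts is computed lazily and cached across getImpacts() calls;
        // advanceShallow(target) resets it to null after advancing the wrapped enums, so
        // the lead is re-selected once per shallow advance rather than on every call.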
if (leadingImpacts == null) { - float maxWeight = Float.MIN_VALUE; - String maxWeightField = ""; - - for (Map.Entry fieldWeightEntry : fieldWeights.entrySet()) { - String field = fieldWeightEntry.getKey(); - float weight = fieldWeightEntry.getValue(); - - if (maxWeight < weight) { - maxWeight = weight; - maxWeightField = field; - } - } - Impacts tmpLead = null; for (Map.Entry> fieldImpacts : fieldsWithImpactsEnums.entrySet()) { @@ -553,24 +545,27 @@ public Impacts getImpacts() throws IOException { fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); if (field.equals(maxWeightField)) { - int minDocFreq = Integer.MAX_VALUE; + long minCost = Long.MAX_VALUE; for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); + ImpactsEnum impactsEnum = impactsEnums.get(i); + Impacts impacts = impactsEnum.getImpacts(); fieldsWithImpacts.get(field).add(impacts); // use the impact of term within this mostly weighted field that has the least doc - // freq - // this may have the result of getting larger upTo bound, as low doc freq term may + // freq (cost) + // this may have the result of getting larger upTo bound, as low doc freq (cost) + // term may // also have large gap in doc ids - // int docFreq = docFreqs.get(i); - // if (tmpLead == null || docFreq < minDocFreq) { - // minDocFreq = docFreq; - // tmpLead = impacts; - // } - - if (tmpLead == null || impacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + // long currentCost = impactsEnums.get(i).cost(); + if (tmpLead == null || impactsEnum.cost() < minCost) { + minCost = impactsEnum.cost(); tmpLead = impacts; } + + // if (tmpLead == null || impacts.getDocIdUpTo(0) < + // tmpLead.getDocIdUpTo(0)) { + // tmpLead = impacts; + // } } } else { // find the impact that has the lowest next boundary for this field From 0a9bdccf01d568b0239cf894ff626e47d8333df3 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 1 Dec 2021 21:16:15 -0800 Subject: [PATCH 12/16] Refactoring - extract out logic that merges impacts within the same field --- .../lucene/search/ImpactsMergingUtils.java | 141 ++++++++++++++++++ .../apache/lucene/search/SynonymQuery.java | 122 +-------------- 2 files changed, 143 insertions(+), 120 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java new file mode 100644 index 00000000000..856ad31cb2b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java @@ -0,0 +1,141 @@ +package org.apache.lucene.search; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.util.PriorityQueue; + +/** + * Utils for merging impacts for SynonymQuery, CombinedFieldsQuery etc + * + * @lucene.internal + */ +public class ImpactsMergingUtils { + + /** + * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} if + * there is no such level. 
+ */ + private static int getLevel(Impacts impacts, int docIdUpTo) { + for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { + if (impacts.getDocIdUpTo(level) >= docIdUpTo) { + return level; + } + } + return -1; + } + + private static class SubIterator { + final Iterator iterator; + int previousFreq; + Impact current; + + SubIterator(Iterator iterator) { + this.iterator = iterator; + this.current = iterator.next(); + } + + void next() { + previousFreq = current.freq; + if (iterator.hasNext() == false) { + current = null; + } else { + current = iterator.next(); + } + } + } + + /** + * Merge impacts from multiple impactsEnum (terms matches) within the same field. + * The high level logic is to combine freqs that have the same norm from impacts. + */ + public static List mergeImpactsPerField( + ImpactsEnum[] impactsEnum, final Impacts[] impacts, float[] boosts, int docIdUpTo) { + List> toMerge = new ArrayList<>(); + + for (int i = 0; i < impactsEnum.length; ++i) { + if (impactsEnum[i].docID() <= docIdUpTo) { + int impactsLevel = getLevel(impacts[i], docIdUpTo); + if (impactsLevel == -1) { + // One instance doesn't have impacts that cover up to docIdUpTo + // Return impacts that trigger the maximum score + return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + } + final List impactList; + if (boosts[i] != 1f) { + float boost = boosts[i]; + impactList = + impacts[i].getImpacts(impactsLevel).stream() + .map(impact -> new Impact((int) Math.ceil(impact.freq * boost), impact.norm)) + .collect(Collectors.toList()); + } else { + impactList = impacts[i].getImpacts(impactsLevel); + } + toMerge.add(impactList); + } + } + assert toMerge.size() > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong + + if (toMerge.size() == 1) { + // common if one synonym is common and the other one is rare + return toMerge.get(0); + } + + PriorityQueue pq = + new PriorityQueue(impacts.length) { + @Override + protected boolean lessThan(SubIterator a, SubIterator b) { + if (a.current == null) { // means iteration is finished + return false; + } + if (b.current == null) { + return true; + } + return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; + } + }; + for (List toMergeImpacts : toMerge) { + pq.add(new SubIterator(toMergeImpacts.iterator())); + } + + List mergedImpacts = new ArrayList<>(); + + // Idea: merge impacts by norm. The tricky thing is that we need to + // consider norm values that are not in the impacts too. For + // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], + // there might well be a document that has a freq of 2 and a length of 11, + // which was just not added to the list of impacts because {freq=2,norm=10} + // is more competitive. So the way it works is that we track the sum of + // the term freqs that we have seen so far in order to account for these + // implicit impacts. 
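+      // Illustrative example of the merge: combining [{freq=2,norm=10}, {freq=4,norm=12}]
+      // with [{freq=3,norm=10}] yields [{freq=5,norm=10}, {freq=7,norm=12}]: freqs are
+      // summed per norm, and the running sum carries lower-norm freqs forward as the
+      // implicit impacts described above.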
+ + long sumTf = 0; + SubIterator top = pq.top(); + do { + final long norm = top.current.norm; + do { + sumTf += top.current.freq - top.previousFreq; + top.next(); + top = pq.updateTop(); + } while (top.current != null && top.current.norm == norm); + + final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); + if (mergedImpacts.isEmpty()) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } else { + Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); + assert Long.compareUnsigned(prevImpact.norm, norm) < 0; + if (freqUpperBound > prevImpact.freq) { + mergedImpacts.add(new Impact(freqUpperBound, norm)); + } // otherwise the previous impact is already more competitive + } + } while (top.current != null); + + return mergedImpacts; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 2aefe3f7860..7905fa60ac3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -21,7 +21,6 @@ import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.Iterator; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; @@ -40,7 +39,6 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PriorityQueue; /** * A query that treats multiple terms as synonyms. @@ -346,26 +344,6 @@ static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums, float[] boosts) { assert impactsEnums.length == boosts.length; return new ImpactsSource() { - class SubIterator { - final Iterator iterator; - int previousFreq; - Impact current; - - SubIterator(Iterator iterator) { - this.iterator = iterator; - this.current = iterator.next(); - } - - void next() { - previousFreq = current.freq; - if (iterator.hasNext() == false) { - current = null; - } else { - current = iterator.next(); - } - } - } - @Override public Impacts getImpacts() throws IOException { final Impacts[] impacts = new Impacts[impactsEnums.length]; @@ -393,107 +371,11 @@ public int getDocIdUpTo(int level) { return lead.getDocIdUpTo(level); } - /** - * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} - * if there is no such level. 
- */ - private int getLevel(Impacts impacts, int docIdUpTo) { - for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { - if (impacts.getDocIdUpTo(level) >= docIdUpTo) { - return level; - } - } - return -1; - } - @Override public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); - - List> toMerge = new ArrayList<>(); - - for (int i = 0; i < impactsEnums.length; ++i) { - if (impactsEnums[i].docID() <= docIdUpTo) { - int impactsLevel = getLevel(impacts[i], docIdUpTo); - if (impactsLevel == -1) { - // One instance doesn't have impacts that cover up to docIdUpTo - // Return impacts that trigger the maximum score - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - final List impactList; - if (boosts[i] != 1f) { - float boost = boosts[i]; - impactList = - impacts[i].getImpacts(impactsLevel).stream() - .map( - impact -> - new Impact((int) Math.ceil(impact.freq * boost), impact.norm)) - .collect(Collectors.toList()); - } else { - impactList = impacts[i].getImpacts(impactsLevel); - } - toMerge.add(impactList); - } - } - assert toMerge.size() - > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong - - if (toMerge.size() == 1) { - // common if one synonym is common and the other one is rare - return toMerge.get(0); - } - - PriorityQueue pq = - new PriorityQueue(impacts.length) { - @Override - protected boolean lessThan(SubIterator a, SubIterator b) { - if (a.current == null) { // means iteration is finished - return false; - } - if (b.current == null) { - return true; - } - return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; - } - }; - for (List impacts : toMerge) { - pq.add(new SubIterator(impacts.iterator())); - } - - List mergedImpacts = new ArrayList<>(); - - // Idea: merge impacts by norm. The tricky thing is that we need to - // consider norm values that are not in the impacts too. For - // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], - // there might well be a document that has a freq of 2 and a length of 11, - // which was just not added to the list of impacts because {freq=2,norm=10} - // is more competitive. So the way it works is that we track the sum of - // the term freqs that we have seen so far in order to account for these - // implicit impacts. 
- - long sumTf = 0; - SubIterator top = pq.top(); - do { - final long norm = top.current.norm; - do { - sumTf += top.current.freq - top.previousFreq; - top.next(); - top = pq.updateTop(); - } while (top.current != null && top.current.norm == norm); - - final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); - if (mergedImpacts.isEmpty()) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } else { - Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); - assert Long.compareUnsigned(prevImpact.norm, norm) < 0; - if (freqUpperBound > prevImpact.freq) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } // otherwise the previous impact is already more competitive - } - } while (top.current != null); - - return mergedImpacts; + return ImpactsMergingUtils.mergeImpactsPerField( + impactsEnums, impacts, boosts, docIdUpTo); } }; } From 808fec28ab6d7179974665e7fa069270845811fa Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 1 Dec 2021 22:05:51 -0800 Subject: [PATCH 13/16] Refactoring - use ImpactsMergingUtils in CombinedFieldsQuery TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighLow 21.45 (13.8%) 15.65 (18.0%) -27.1% ( -51% - 5%) 0.000 CFQHighMed 5.92 (4.2%) 4.52 (12.2%) -23.7% ( -38% - -7%) 0.000 CFQHighHigh 5.52 (5.0%) 4.45 (12.1%) -19.4% ( -34% - -2%) 0.000 PKLookup 110.45 (11.1%) 108.08 (15.7%) -2.1% ( -26% - 27%) 0.617 = Candidate CPU JFR PROFILE SUMMARY from 86268 events (total: 86268) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 15.48% 13355 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 7.37% 6359 org.apache.lucene.search.DisjunctionDISIApproximation#advance() 7.35% 6345 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 6.38% 5508 org.apache.lucene.search.DisiPriorityQueue#downHeap() 4.18% 3605 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 4.10% 3538 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 4.01% 3456 org.apache.lucene.search.DisiPriorityQueue#topList() 3.67% 3169 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 3.36% 2899 org.apache.lucene.search.DisiPriorityQueue#top() 3.08% 2660 java.lang.Math#round() 2.40% 2067 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.38% 2057 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 1.94% 1675 org.apache.lucene.util.SmallFloat#longToInt4() 1.87% 1617 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.86% 1605 org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.80% 1557 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader#findFirstGreater() 1.50% 1293 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#advance() 1.39% 1200 org.apache.lucene.search.ImpactsDISI#docID() 1.33% 1145 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.24% 1067 jdk.internal.misc.Unsafe#getByte() 1.18% 1019 org.apache.lucene.store.ByteBufferGuard#getByte() 0.92% 792 org.apache.lucene.util.SmallFloat#intToByte4() 0.76% 659 java.lang.Math#toIntExact() 0.68% 584 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#freq() 0.67% 579 org.apache.lucene.search.ImpactsDISI#nextDoc() 0.61% 530 org.apache.lucene.search.ImpactsDISI#advanceTarget() 0.58% 500 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 0.57% 
492 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.40% 345 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32() 0.39% 333 org.apache.lucene.codecs.lucene90.ForUtil#expand8() = Baseline CPU JFR PROFILE SUMMARY from 76683 events (total: 76683) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 16.26% 12468 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 8.46% 6490 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 8.24% 6315 org.apache.lucene.search.DisjunctionDISIApproximation#nextDoc() 6.13% 4698 org.apache.lucene.search.DisiPriorityQueue#downHeap() 4.79% 3676 org.apache.lucene.search.DisiPriorityQueue#topList() 4.45% 3412 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 4.32% 3314 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 3.25% 2492 java.lang.Math#round() 3.01% 2309 org.apache.lucene.search.DisiPriorityQueue#top() 2.75% 2107 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 2.62% 2010 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 2.29% 1753 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 2.12% 1623 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.05% 1569 org.apache.lucene.util.SmallFloat#longToInt4() 1.93% 1481 org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.77% 1358 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.45% 1110 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.25% 958 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#nextDoc() 1.19% 914 org.apache.lucene.store.ByteBufferGuard#getByte() 1.17% 898 java.lang.Math#toIntExact() 1.06% 815 org.apache.lucene.util.SmallFloat#intToByte4() 1.02% 782 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#freq() 0.86% 656 jdk.internal.misc.Unsafe#getByte() 0.68% 518 org.apache.lucene.codecs.lucene90.PForUtil#decodeAndPrefixSum() 0.64% 492 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.49% 375 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32() 0.42% 322 org.apache.lucene.codecs.lucene90.ForUtil#expand8() 0.31% 239 java.util.zip.Inflater#inflateBytesBytes() 0.30% 228 org.apache.lucene.codecs.lucene90.blocktree.SegmentTermsEnum#seekExact() 0.29% 224 org.apache.lucene.codecs.lucene90.PForUtil#expand32() --- .../lucene/search/ImpactsMergingUtils.java | 59 ++++- .../apache/lucene/search/SynonymQuery.java | 2 +- .../sandbox/search/CombinedFieldQuery.java | 232 ++++-------------- 3 files changed, 102 insertions(+), 191 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java index 856ad31cb2b..2c50072f307 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.search; import java.util.ArrayList; @@ -9,13 +25,22 @@ import org.apache.lucene.index.Impacts; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.SmallFloat; /** * Utils for merging impacts for SynonymQuery, CombinedFieldsQuery etc * * @lucene.internal */ -public class ImpactsMergingUtils { +public final class ImpactsMergingUtils { + /** Cache of decoded norms. */ + private static final float[] LENGTH_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); + } + } /** * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, or {@code -1} if @@ -50,12 +75,23 @@ void next() { } } + private static double normToLength(long norm) { + return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; + } + /** - * Merge impacts from multiple impactsEnum (terms matches) within the same field. - * The high level logic is to combine freqs that have the same norm from impacts. + * Merge impacts from multiple impactsEnum (terms matches) within the same field. The high level + * logic is to combine freqs that have the same norm from impacts. */ public static List mergeImpactsPerField( - ImpactsEnum[] impactsEnum, final Impacts[] impacts, float[] boosts, int docIdUpTo) { + ImpactsEnum[] impactsEnum, + Impacts[] impacts, + float[] termBoosts, + int docIdUpTo, + boolean combineMultiNorms) { + assert impactsEnum.length == impacts.length; + assert impactsEnum.length == termBoosts.length; + List> toMerge = new ArrayList<>(); for (int i = 0; i < impactsEnum.length; ++i) { @@ -67,11 +103,20 @@ public static List mergeImpactsPerField( return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); } final List impactList; - if (boosts[i] != 1f) { - float boost = boosts[i]; + if (termBoosts[i] != 1f) { + float boost = termBoosts[i]; impactList = impacts[i].getImpacts(impactsLevel).stream() - .map(impact -> new Impact((int) Math.ceil(impact.freq * boost), impact.norm)) + .map( + impact -> { + int boostedFreq = (int) Math.ceil(impact.freq * boost); + long boostedNorm = + combineMultiNorms + ? 
SmallFloat.intToByte4( + (int) Math.floor(normToLength(impact.norm) * boost)) + : impact.norm; + return new Impact(boostedFreq, boostedNorm); + }) .collect(Collectors.toList()); } else { impactList = impacts[i].getImpacts(impactsLevel); diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 7905fa60ac3..66044ab412c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -375,7 +375,7 @@ public int getDocIdUpTo(int level) { public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); return ImpactsMergingUtils.mergeImpactsPerField( - impactsEnums, impacts, boosts, docIdUpTo); + impactsEnums, impacts, boosts, docIdUpTo, false); } }; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 9833a3e1a49..6b6cfd14a9c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -22,8 +22,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; @@ -51,6 +49,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.ImpactsDISI; +import org.apache.lucene.search.ImpactsMergingUtils; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.Matches; @@ -68,7 +67,6 @@ import org.apache.lucene.search.similarities.SimilarityBase; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.SmallFloat; @@ -102,15 +100,6 @@ * @lucene.experimental */ public final class CombinedFieldQuery extends Query implements Accountable { - /** Cache of decoded norms. 
*/ - private static final float[] LENGTH_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) { - LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); - } - } - private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(CombinedFieldQuery.class); @@ -423,8 +412,7 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOExceptio public Scorer scorer(LeafReaderContext context) throws IOException { List iterators = new ArrayList<>(); List fields = new ArrayList<>(); - Map> fieldImpactsEnum = new HashMap<>(fieldAndWeights.size()); - Map> fieldImpacts = new HashMap<>(fieldAndWeights.size()); + Map> tempFieldImpactsEnums = new HashMap<>(fieldAndWeights.size()); float maxWeight = Float.MIN_VALUE; String maxWeightField = ""; @@ -433,7 +421,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { if (state != null) { String fieldName = fieldTerms[i].field(); fields.add(fieldAndWeights.get(fieldName)); - fieldImpactsEnum.putIfAbsent(fieldName, new ArrayList<>()); + tempFieldImpactsEnums.putIfAbsent(fieldName, new ArrayList<>()); TermsEnum termsEnum = context.reader().terms(fieldName).iterator(); termsEnum.seekExact(fieldTerms[i].bytes(), state); @@ -447,7 +435,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); - fieldImpactsEnum.get(fieldName).add(impactsEnum); + tempFieldImpactsEnums.get(fieldName).add(impactsEnum); } else { PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS); iterators.add(postingsEnum); @@ -466,14 +454,8 @@ public Scorer scorer(LeafReaderContext context) throws IOException { // we use termscorers + disjunction as an impl detail DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size()); - Map fieldWeights = new HashMap<>(fieldImpactsEnum.size()); for (int i = 0; i < iterators.size(); i++) { FieldAndWeight fieldAndWeight = fields.get(i); - if (fieldWeights.containsKey(fieldAndWeight.field)) { - assert fieldWeights.get(fieldAndWeight.field).equals(fieldAndWeight.weight); - } else { - fieldWeights.put(fieldAndWeight.field, fieldAndWeight.weight); - } queue.add( new WeightedDisiWrapper( new TermScorer(this, iterators.get(i), nonScoringSimScorer), @@ -486,8 +468,19 @@ public Scorer scorer(LeafReaderContext context) throws IOException { ImpactsDISI impactsDisi = null; if (scoreMode == ScoreMode.TOP_SCORES) { - ImpactsSource impactsSource = - mergeImpacts(fieldImpactsEnum, fieldImpacts, fieldWeights, maxWeightField); + + Map fieldImpactsEnums = new HashMap<>(); + Map fieldWeights = new HashMap<>(); + + for (Map.Entry> e : tempFieldImpactsEnums.entrySet()) { + fieldImpactsEnums.put(e.getKey(), e.getValue().toArray(new ImpactsEnum[0])); + + float[] weights = new float[e.getValue().size()]; + Arrays.fill(weights, fieldAndWeights.get(e.getKey()).weight); + fieldWeights.put(e.getKey(), weights); + } + + ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnums, fieldWeights, maxWeightField); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -502,33 +495,14 @@ public boolean isCacheable(LeafReaderContext ctx) { /** Merge impacts for combined field. 
*/ static ImpactsSource mergeImpacts( - Map> fieldsWithImpactsEnums, - Map> fieldsWithImpacts, - Map fieldWeights, + Map fieldImpactsEnum, + Map fieldWeights, String maxWeightField) { + return new ImpactsSource() { + final Map fieldImpacts = new HashMap<>(); Impacts leadingImpacts = null; - class SubIterator { - final Iterator iterator; - int previousFreq; - Impact current; - - SubIterator(Iterator iterator) { - this.iterator = iterator; - this.current = iterator.next(); - } - - void next() { - previousFreq = current.freq; - if (iterator.hasNext() == false) { - current = null; - } else { - current = iterator.next(); - } - } - } - @Override public Impacts getImpacts() throws IOException { // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for @@ -536,42 +510,40 @@ public Impacts getImpacts() throws IOException { // They collectively will decide on the number of levels and the block boundaries. if (leadingImpacts == null) { + fieldImpacts.clear(); + Impacts tmpLead = null; - for (Map.Entry> fieldImpacts : - fieldsWithImpactsEnums.entrySet()) { - String field = fieldImpacts.getKey(); - List impactsEnums = fieldImpacts.getValue(); - // List docFreqs = fieldTermDocFreq.get(field); - fieldsWithImpacts.put(field, new ArrayList<>(impactsEnums.size())); + for (Map.Entry e : fieldImpactsEnum.entrySet()) { + String field = e.getKey(); + ImpactsEnum[] impactsEnums = e.getValue(); + Impacts[] impacts = new Impacts[impactsEnums.length]; + fieldImpacts.put(field, impacts); if (field.equals(maxWeightField)) { long minCost = Long.MAX_VALUE; - for (int i = 0; i < impactsEnums.size(); ++i) { - ImpactsEnum impactsEnum = impactsEnums.get(i); - Impacts impacts = impactsEnum.getImpacts(); - fieldsWithImpacts.get(field).add(impacts); + for (int i = 0; i < impactsEnums.length; ++i) { + ImpactsEnum impactsEnum = impactsEnums[i]; + impacts[i] = impactsEnum.getImpacts(); // use the impact of term within this mostly weighted field that has the least doc // freq (cost) // this may have the result of getting larger upTo bound, as low doc freq (cost) // term may // also have large gap in doc ids - // long currentCost = impactsEnums.get(i).cost(); if (tmpLead == null || impactsEnum.cost() < minCost) { minCost = impactsEnum.cost(); - tmpLead = impacts; + tmpLead = impacts[i]; } // if (tmpLead == null || impacts.getDocIdUpTo(0) < // tmpLead.getDocIdUpTo(0)) { - // tmpLead = impacts; + // tmpLead = impacts[i]; // } } } else { // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.size(); ++i) { - Impacts impacts = impactsEnums.get(i).getImpacts(); - fieldsWithImpacts.get(field).add(impacts); + for (int i = 0; i < impactsEnums.length; ++i) { + impacts[i] = impactsEnums[i].getImpacts(); } } } @@ -593,22 +565,25 @@ public int getDocIdUpTo(int level) { @Override public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); - final Map> mergedImpactsPerField = mergeImpactsPerField(docIdUpTo); + final Map> mergedImpactsPerField = + getMergedImpactsPerField(docIdUpTo); return mergeImpactsAcrossFields(mergedImpactsPerField); } - private Map> mergeImpactsPerField(int docIdUpTo) { - final Map> result = new HashMap<>(fieldsWithImpactsEnums.size()); - - for (Map.Entry> impactsPerField : - fieldsWithImpactsEnums.entrySet()) { - String field = impactsPerField.getKey(); - List impactsEnums = impactsPerField.getValue(); - List impacts = fieldsWithImpacts.get(field); + private Map> getMergedImpactsPerField(int docIdUpTo) { + final Map> result 
= new HashMap<>(fieldImpactsEnum.size()); + for (Map.Entry e : fieldImpactsEnum.entrySet()) { + String field = e.getKey(); + ImpactsEnum[] impactsEnums = e.getValue(); List mergedImpacts = - doMergeImpactsPerField(field, impactsEnums, impacts, docIdUpTo); + ImpactsMergingUtils.mergeImpactsPerField( + impactsEnums, + fieldImpacts.get(field), + fieldWeights.get(field), + docIdUpTo, + true); if (mergedImpacts.size() == 0) { // all impactEnums for this field were positioned beyond docIdUpTo, continue to next @@ -627,111 +602,6 @@ private Map> mergeImpactsPerField(int docIdUpTo) { return result; } - // Merge impacts from same field by summing freqs with the same norms - the same logic - // used for SynonymQuery - private List doMergeImpactsPerField( - String field, List impactsEnums, List impacts, int docIdUpTo) { - List> toMerge = new ArrayList<>(impactsEnums.size()); - - for (int i = 0; i < impactsEnums.size(); ++i) { - if (impactsEnums.get(i).docID() <= docIdUpTo) { - int impactsLevel = getLevel(impacts.get(i), docIdUpTo); - if (impactsLevel == -1) { - // one instance doesn't have impacts that cover up to docIdUpTo - // return impacts that trigger the maximum score - return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); - } - float weight = fieldWeights.get(field); - if (weight != 1f) { - final List originalImpactList = impacts.get(i).getImpacts(impactsLevel); - final List impactList = new ArrayList<>(originalImpactList.size()); - for (Impact impact : originalImpactList) { - impactList.add( - new Impact( - (int) Math.ceil(impact.freq * weight), - SmallFloat.intToByte4( - (int) Math.floor(normToLength(impact.norm) * weight)))); - } - toMerge.add(impactList); - } else { - toMerge.add(impacts.get(i).getImpacts(impactsLevel)); - } - } - } - - // all impactEnums for this field were positioned beyond docIdUpTo - if (toMerge.size() == 0) { - return new ArrayList<>(); - } - - if (toMerge.size() == 1) { - // common if one term is common and the other one is rare - return toMerge.get(0); - } - - PriorityQueue pq = - new PriorityQueue(impacts.size()) { - @Override - protected boolean lessThan(SubIterator a, SubIterator b) { - if (a.current == null) { // means iteration is finished - return false; - } - if (b.current == null) { - return true; - } - return Long.compareUnsigned(a.current.norm, b.current.norm) < 0; - } - }; - - for (List toMergeImpacts : toMerge) { - pq.add(new SubIterator(toMergeImpacts.iterator())); - } - - List mergedImpacts = new LinkedList<>(); - - // Idea: merge impacts by norm. The tricky thing is that we need to - // consider norm values that are not in the impacts too. For - // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], - // there might well be a document that has a freq of 2 and a length of 11, - // which was just not added to the list of impacts because {freq=2,norm=10} - // is more competitive. So the way it works is that we track the sum of - // the term freqs that we have seen so far in order to account for these - // implicit impacts. 
- - long sumTf = 0; - SubIterator top = pq.top(); - do { - final long norm = top.current.norm; - do { - sumTf += top.current.freq - top.previousFreq; - top.next(); - top = pq.updateTop(); - } while (top.current != null && top.current.norm == norm); - - final int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf); - if (mergedImpacts.isEmpty()) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } else { - Impact prevImpact = mergedImpacts.get(mergedImpacts.size() - 1); - assert Long.compareUnsigned(prevImpact.norm, norm) < 0; - if (freqUpperBound > prevImpact.freq) { - mergedImpacts.add(new Impact(freqUpperBound, norm)); - } // otherwise the previous impact is already more competitive - } - } while (top.current != null); - - return mergedImpacts; - } - - private int getLevel(Impacts impacts, int docIdUpTo) { - for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { - if (impacts.getDocIdUpTo(level) >= docIdUpTo) { - return level; - } - } - return -1; - } - private List mergeImpactsAcrossFields( Map> mergedImpactsPerField) { if (mergedImpactsPerField.size() == 1) { @@ -758,13 +628,9 @@ private List mergeImpactsAcrossFields( }; } - private float normToLength(long norm) { - return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; - } - @Override public void advanceShallow(int target) throws IOException { - for (List impactsEnums : fieldsWithImpactsEnums.values()) { + for (ImpactsEnum[] impactsEnums : fieldImpactsEnum.values()) { for (ImpactsEnum impactsEnum : impactsEnums) { if (impactsEnum.docID() < target) { impactsEnum.advanceShallow(target); From 75c5b046f8af4e73341a26a1828e97e006115a6b Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Thu, 2 Dec 2021 00:17:16 -0800 Subject: [PATCH 14/16] Use getDocIdUpTo to determine lead impacts TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value CFQHighHigh 3.68 (6.5%) 2.66 (7.3%) -27.8% ( -39% - -14%) 0.000 CFQHighLow 14.19 (7.9%) 11.31 (8.0%) -20.3% ( -33% - -4%) 0.000 CFQHighMed 12.92 (9.5%) 10.41 (9.9%) -19.4% ( -35% - 0%) 0.000 PKLookup 104.92 (6.7%) 104.49 (9.0%) -0.4% ( -15% - 16%) 0.869 = Candidate CPU JFR PROFILE SUMMARY from 87201 events (total: 87201) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 15.13% 13191 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 7.42% 6466 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 7.29% 6354 org.apache.lucene.search.DisjunctionDISIApproximation#advance() 6.79% 5921 org.apache.lucene.search.DisiPriorityQueue#downHeap() 4.34% 3785 org.apache.lucene.search.DisiPriorityQueue#topList() 3.97% 3465 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 3.85% 3360 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 3.18% 2777 java.lang.Math#round() 3.01% 2624 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 3.01% 2624 org.apache.lucene.search.DisiPriorityQueue#top() 2.33% 2036 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 2.20% 1917 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.07% 1809 org.apache.lucene.util.SmallFloat#longToInt4() 1.81% 1580 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.78% 1555 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#advance() 1.77% 1545 
org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.51% 1316 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader#findFirstGreater() 1.39% 1210 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.29% 1124 org.apache.lucene.search.ImpactsDISI#docID() 1.22% 1063 jdk.internal.misc.Unsafe#getByte() 1.19% 1037 org.apache.lucene.store.ByteBufferGuard#getByte() 1.08% 938 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.97% 849 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 0.86% 750 org.apache.lucene.codecs.MultiLevelSkipListReader#skipTo() 0.82% 717 org.apache.lucene.util.SmallFloat#intToByte4() 0.78% 683 java.lang.Math#toIntExact() 0.62% 540 org.apache.lucene.codecs.lucene90.PForUtil#decode() 0.61% 529 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsDocsEnum#freq() 0.51% 441 org.apache.lucene.search.ImpactsDISI#nextDoc() 0.50% 435 org.apache.lucene.search.ImpactsDISI#advanceTarget() = Baseline CPU JFR PROFILE SUMMARY from 81578 events (total: 81578) tests.profile.mode=cpu tests.profile.count=30 tests.profile.stacksize=1 tests.profile.linenumbers=false PERCENT CPU SAMPLES STACK 16.38% 13363 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer$MultiFieldNormValues#advanceExact() 8.67% 7073 org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer#score() 8.05% 6569 org.apache.lucene.search.DisjunctionDISIApproximation#nextDoc() 6.50% 5305 org.apache.lucene.search.DisiPriorityQueue#downHeap() 5.17% 4220 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#freq() 4.40% 3587 org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1#collect() 4.30% 3505 org.apache.lucene.search.DisiPriorityQueue#topList() 3.37% 2751 java.lang.Math#round() 2.77% 2263 org.apache.lucene.search.DisiPriorityQueue#top() 2.64% 2156 org.apache.lucene.sandbox.search.CombinedFieldQuery$WeightedDisiWrapper#freq() 2.62% 2135 org.apache.lucene.sandbox.search.CombinedFieldQuery$CombinedFieldScorer#score() 2.12% 1729 org.apache.lucene.sandbox.search.MultiNormsLeafSimScorer#getNormValue() 2.11% 1723 org.apache.lucene.util.SmallFloat#longToInt4() 2.06% 1678 org.apache.lucene.search.Weight$DefaultBulkScorer#scoreAll() 2.01% 1640 org.apache.lucene.store.ByteBufferGuard#ensureValid() 1.77% 1442 org.apache.lucene.codecs.lucene90.Lucene90NormsProducer$3#longValue() 1.62% 1325 org.apache.lucene.search.DisiPriorityQueue#updateTop() 1.26% 1028 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#nextDoc() 1.10% 899 java.lang.Math#toIntExact() 1.10% 898 org.apache.lucene.util.SmallFloat#intToByte4() 1.09% 888 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum#freq() 1.06% 864 org.apache.lucene.store.ByteBufferGuard#getByte() 0.80% 652 jdk.internal.misc.Unsafe#getByte() 0.74% 605 org.apache.lucene.codecs.lucene90.PForUtil#decode() 0.68% 552 org.apache.lucene.search.DisiPriorityQueue#prepend() 0.60% 492 org.apache.lucene.codecs.lucene90.PForUtil#decodeAndPrefixSum() 0.55% 451 java.io.RandomAccessFile#readBytes() 0.38% 314 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32() 0.38% 307 org.apache.lucene.codecs.lucene90.ForUtil#expand8() 0.32% 260 org.apache.lucene.codecs.lucene90.PForUtil#expand32() --- .../sandbox/search/CombinedFieldQuery.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 
6b6cfd14a9c..8583450f1b2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -520,7 +520,7 @@ public Impacts getImpacts() throws IOException { fieldImpacts.put(field, impacts); if (field.equals(maxWeightField)) { - long minCost = Long.MAX_VALUE; +// long minCost = Long.MAX_VALUE; for (int i = 0; i < impactsEnums.length; ++i) { ImpactsEnum impactsEnum = impactsEnums[i]; impacts[i] = impactsEnum.getImpacts(); @@ -530,15 +530,15 @@ public Impacts getImpacts() throws IOException { // this may have the result of getting larger upTo bound, as low doc freq (cost) // term may // also have large gap in doc ids - if (tmpLead == null || impactsEnum.cost() < minCost) { - minCost = impactsEnum.cost(); +// if (tmpLead == null || impactsEnum.cost() < minCost) { +// minCost = impactsEnum.cost(); +// tmpLead = impacts[i]; +// } + + if (tmpLead == null || impacts[i].getDocIdUpTo(0) < + tmpLead.getDocIdUpTo(0)) { tmpLead = impacts[i]; } - - // if (tmpLead == null || impacts.getDocIdUpTo(0) < - // tmpLead.getDocIdUpTo(0)) { - // tmpLead = impacts[i]; - // } } } else { // find the impact that has the lowest next boundary for this field From cbbd28bb2964228542e72b112f3b2f5f3164fae7 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Tue, 7 Dec 2021 20:03:26 -0800 Subject: [PATCH 15/16] Use leading impacts across fields --- .../lucene/search/ImpactsMergingUtils.java | 8 +- .../sandbox/search/CombinedFieldQuery.java | 77 ++++++++----------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java index 2c50072f307..049fae8c6d9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java +++ b/lucene/core/src/java/org/apache/lucene/search/ImpactsMergingUtils.java @@ -124,7 +124,13 @@ public static List mergeImpactsPerField( toMerge.add(impactList); } } - assert toMerge.size() > 0; // otherwise it would mean the docID is > docIdUpTo, which is wrong + + // all impactEnums for this field were positioned beyond docIdUpTo, which is possible when + // 1. there are multiple fields involved. + // 2. 
docIdUpTo was taken from minimum from all impactEnums across fields + if (toMerge.size() == 0) { + return new ArrayList<>(); + } if (toMerge.size() == 1) { // common if one synonym is common and the other one is rare diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index 8583450f1b2..d3a2b022afe 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -415,7 +415,6 @@ public Scorer scorer(LeafReaderContext context) throws IOException { Map> tempFieldImpactsEnums = new HashMap<>(fieldAndWeights.size()); float maxWeight = Float.MIN_VALUE; - String maxWeightField = ""; for (int i = 0; i < fieldTerms.length; i++) { TermState state = termStates[i].get(context); if (state != null) { @@ -427,12 +426,6 @@ public Scorer scorer(LeafReaderContext context) throws IOException { termsEnum.seekExact(fieldTerms[i].bytes(), state); if (scoreMode == ScoreMode.TOP_SCORES) { - float weight = fieldAndWeights.get(fieldName).weight; - if (maxWeight < weight) { - maxWeight = weight; - maxWeightField = fieldName; - } - ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS); iterators.add(impactsEnum); tempFieldImpactsEnums.get(fieldName).add(impactsEnum); @@ -480,7 +473,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { fieldWeights.put(e.getKey(), weights); } - ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnums, fieldWeights, maxWeightField); + ImpactsSource impactsSource = mergeImpacts(fieldImpactsEnums, fieldWeights); iterator = impactsDisi = new ImpactsDISI(iterator, impactsSource, simWeight); } @@ -495,13 +488,11 @@ public boolean isCacheable(LeafReaderContext ctx) { /** Merge impacts for combined field. */ static ImpactsSource mergeImpacts( - Map fieldImpactsEnum, - Map fieldWeights, - String maxWeightField) { + Map fieldImpactsEnum, Map fieldWeights) { return new ImpactsSource() { final Map fieldImpacts = new HashMap<>(); - Impacts leadingImpacts = null; + Map leadingImpactsPerField = null; @Override public Impacts getImpacts() throws IOException { @@ -509,57 +500,57 @@ public Impacts getImpacts() throws IOException { // each field // They collectively will decide on the number of levels and the block boundaries. 
- if (leadingImpacts == null) { + if (leadingImpactsPerField == null) { + leadingImpactsPerField = new HashMap<>(fieldImpactsEnum.size()); fieldImpacts.clear(); - Impacts tmpLead = null; for (Map.Entry e : fieldImpactsEnum.entrySet()) { String field = e.getKey(); ImpactsEnum[] impactsEnums = e.getValue(); Impacts[] impacts = new Impacts[impactsEnums.length]; fieldImpacts.put(field, impacts); - if (field.equals(maxWeightField)) { -// long minCost = Long.MAX_VALUE; - for (int i = 0; i < impactsEnums.length; ++i) { - ImpactsEnum impactsEnum = impactsEnums[i]; - impacts[i] = impactsEnum.getImpacts(); - - // use the impact of term within this mostly weighted field that has the least doc - // freq (cost) - // this may have the result of getting larger upTo bound, as low doc freq (cost) - // term may - // also have large gap in doc ids -// if (tmpLead == null || impactsEnum.cost() < minCost) { -// minCost = impactsEnum.cost(); -// tmpLead = impacts[i]; -// } - - if (tmpLead == null || impacts[i].getDocIdUpTo(0) < - tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts[i]; - } - } - } else { - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.length; ++i) { - impacts[i] = impactsEnums[i].getImpacts(); + Impacts tmpLead = null; + // find the impact that has the lowest next boundary for this field + for (int i = 0; i < impactsEnums.length; ++i) { + Impacts currentImpacts = impactsEnums[i].getImpacts(); + impacts[i] = currentImpacts; + + if (tmpLead == null || currentImpacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = currentImpacts; } } + + leadingImpactsPerField.put(field, tmpLead); } - leadingImpacts = tmpLead; } return new Impacts() { @Override public int numLevels() { - return leadingImpacts.numLevels(); + // max of levels across fields' impactEnums + int result = 0; + + for (Impacts impacts : leadingImpactsPerField.values()) { + result = Math.max(result, impacts.numLevels()); + } + + return result; } @Override public int getDocIdUpTo(int level) { - return leadingImpacts.getDocIdUpTo(level); + // min of docIdUpTo across fields' impactEnums + int result = Integer.MAX_VALUE; + + for (Impacts impacts : leadingImpactsPerField.values()) { + if (impacts.numLevels() > level) { + result = Math.min(result, impacts.getDocIdUpTo(level)); + } + } + + return result; } @Override @@ -637,7 +628,7 @@ public void advanceShallow(int target) throws IOException { } } } - leadingImpacts = null; + leadingImpactsPerField = null; } }; } From 502d44e7ff0b6de6061a4899589e4523e9bf74e8 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Tue, 7 Dec 2021 21:27:06 -0800 Subject: [PATCH 16/16] Use higher abstraction SynonymImpactsSource in CombinedFieldQuery --- .../lucene/search/SynonymImpactsSource.java | 88 +++++++++++++++++++ .../apache/lucene/search/SynonymQuery.java | 50 +---------- .../sandbox/search/CombinedFieldQuery.java | 66 +++++++------- 3 files changed, 120 insertions(+), 84 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java b/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java new file mode 100644 index 00000000000..4366c012670 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymImpactsSource.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.ImpactsSource; + +public class SynonymImpactsSource implements ImpactsSource { + + private final ImpactsEnum[] impactsEnums; + private final Impacts[] impacts; + private final float[] boosts; + private Impacts lead; + + public SynonymImpactsSource(ImpactsEnum[] impactsEnums, float[] boosts) { + this.impactsEnums = impactsEnums; + this.boosts = boosts; + this.impacts = new Impacts[impactsEnums.length]; + } + + @Override + public Impacts getImpacts() throws IOException { + // Use the impacts that have the lower next boundary as a lead. + // It will decide on the number of levels and the block boundaries. + if (lead == null) { + Impacts tmpLead = null; + for (int i = 0; i < impactsEnums.length; ++i) { + impacts[i] = impactsEnums[i].getImpacts(); + if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { + tmpLead = impacts[i]; + } + } + lead = tmpLead; + } + return new Impacts() { + + @Override + public int numLevels() { + // Delegate to the lead + return lead.numLevels(); + } + + @Override + public int getDocIdUpTo(int level) { + // Delegate to the lead + return lead.getDocIdUpTo(level); + } + + @Override + public List getImpacts(int level) { + final int docIdUpTo = getDocIdUpTo(level); + return ImpactsMergingUtils.mergeImpactsPerField( + impactsEnums, impacts, boosts, docIdUpTo, false); + } + }; + } + + @Override + public void advanceShallow(int target) throws IOException { + for (ImpactsEnum impactsEnum : impactsEnums) { + if (impactsEnum.docID() < target) { + impactsEnum.advanceShallow(target); + } + } + } + + public Impacts[] impacts() { + return impacts; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index 66044ab412c..f8e38a07d24 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -24,8 +24,6 @@ import java.util.List; import java.util.Objects; import java.util.stream.Collectors; -import org.apache.lucene.index.Impact; -import org.apache.lucene.index.Impacts; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.ImpactsSource; import org.apache.lucene.index.IndexReader; @@ -342,53 +340,7 @@ public boolean isCacheable(LeafReaderContext ctx) { /** Merge impacts for multiple synonyms. 
*/ static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums, float[] boosts) { assert impactsEnums.length == boosts.length; - return new ImpactsSource() { - - @Override - public Impacts getImpacts() throws IOException { - final Impacts[] impacts = new Impacts[impactsEnums.length]; - // Use the impacts that have the lower next boundary as a lead. - // It will decide on the number of levels and the block boundaries. - Impacts tmpLead = null; - for (int i = 0; i < impactsEnums.length; ++i) { - impacts[i] = impactsEnums[i].getImpacts(); - if (tmpLead == null || impacts[i].getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = impacts[i]; - } - } - final Impacts lead = tmpLead; - return new Impacts() { - - @Override - public int numLevels() { - // Delegate to the lead - return lead.numLevels(); - } - - @Override - public int getDocIdUpTo(int level) { - // Delegate to the lead - return lead.getDocIdUpTo(level); - } - - @Override - public List getImpacts(int level) { - final int docIdUpTo = getDocIdUpTo(level); - return ImpactsMergingUtils.mergeImpactsPerField( - impactsEnums, impacts, boosts, docIdUpTo, false); - } - }; - } - - @Override - public void advanceShallow(int target) throws IOException { - for (ImpactsEnum impactsEnum : impactsEnums) { - if (impactsEnum.docID() < target) { - impactsEnum.advanceShallow(target); - } - } - } - }; + return new SynonymImpactsSource(impactsEnums, boosts); } private static class SynonymScorer extends Scorer { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java index d3a2b022afe..d8ed894f150 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java @@ -57,6 +57,7 @@ import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.SynonymImpactsSource; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermScorer; import org.apache.lucene.search.TermStatistics; @@ -491,49 +492,32 @@ static ImpactsSource mergeImpacts( Map fieldImpactsEnum, Map fieldWeights) { return new ImpactsSource() { - final Map fieldImpacts = new HashMap<>(); - Map leadingImpactsPerField = null; + Map fieldImpactsSource = null; @Override public Impacts getImpacts() throws IOException { - // Use the impacts that have the lower next boundary (doc id in skip entry) as a lead for - // each field - // They collectively will decide on the number of levels and the block boundaries. 
- - if (leadingImpactsPerField == null) { - leadingImpactsPerField = new HashMap<>(fieldImpactsEnum.size()); - fieldImpacts.clear(); - + if (fieldImpactsSource == null) { + fieldImpactsSource = new HashMap<>(); for (Map.Entry e : fieldImpactsEnum.entrySet()) { - String field = e.getKey(); - ImpactsEnum[] impactsEnums = e.getValue(); - Impacts[] impacts = new Impacts[impactsEnums.length]; - fieldImpacts.put(field, impacts); - - Impacts tmpLead = null; - // find the impact that has the lowest next boundary for this field - for (int i = 0; i < impactsEnums.length; ++i) { - Impacts currentImpacts = impactsEnums[i].getImpacts(); - impacts[i] = currentImpacts; - - if (tmpLead == null || currentImpacts.getDocIdUpTo(0) < tmpLead.getDocIdUpTo(0)) { - tmpLead = currentImpacts; - } - } - - leadingImpactsPerField.put(field, tmpLead); + SynonymImpactsSource source = + new SynonymImpactsSource(e.getValue(), fieldWeights.get(e.getKey())); + fieldImpactsSource.put(e.getKey(), source); } } return new Impacts() { - @Override public int numLevels() { // max of levels across fields' impactEnums int result = 0; - for (Impacts impacts : leadingImpactsPerField.values()) { - result = Math.max(result, impacts.numLevels()); + for (SynonymImpactsSource s : fieldImpactsSource.values()) { + try { + result = Math.max(result, s.getImpacts().numLevels()); + } catch (IOException e) { + // nocommit to be handled + e.printStackTrace(); + } } return result; @@ -544,15 +528,26 @@ public int getDocIdUpTo(int level) { // min of docIdUpTo across fields' impactEnums int result = Integer.MAX_VALUE; - for (Impacts impacts : leadingImpactsPerField.values()) { - if (impacts.numLevels() > level) { - result = Math.min(result, impacts.getDocIdUpTo(level)); + for (SynonymImpactsSource s : fieldImpactsSource.values()) { + Impacts impacts; + try { + impacts = s.getImpacts(); + if (impacts.numLevels() > level) { + result = Math.min(result, impacts.getDocIdUpTo(level)); + } + } catch (IOException e) { + // nocommit to be handled + e.printStackTrace(); } } return result; } + // this can't loop over each field's SynonymImpactsSource.getImpacts().getImpacts(level) + // and then combine impacts, + // as docIdUpTo of each SynonymImpactsSource.getImpacts().getImpacts(level) might be + // different for the same level @Override public List getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); @@ -571,7 +566,7 @@ private Map> getMergedImpactsPerField(int docIdUpTo) { List mergedImpacts = ImpactsMergingUtils.mergeImpactsPerField( impactsEnums, - fieldImpacts.get(field), + fieldImpactsSource.get(field).impacts(), fieldWeights.get(field), docIdUpTo, true); @@ -628,7 +623,8 @@ public void advanceShallow(int target) throws IOException { } } } - leadingImpactsPerField = null; + + fieldImpactsSource = null; } }; }
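
Editor's note (illustration, not part of the patches above): the final mergeImpacts() builds one SynonymImpactsSource per field and then exposes a combined Impacts view whose numLevels() is the maximum across the per-field leads and whose getDocIdUpTo(level) is the minimum across the leads that actually have that level. Below is a minimal sketch of just that combination logic, assuming lucene-core on the classpath; the class name and the leadPerField map are hypothetical stand-ins for the per-field lead Impacts that SynonymImpactsSource exposes.

import java.util.Map;
import org.apache.lucene.index.Impacts;

final class CombinedFieldImpactsLevelsSketch {

  /** Number of levels of the combined view: the max across the per-field leads. */
  static int numLevels(Map<String, Impacts> leadPerField) {
    int result = 0;
    for (Impacts lead : leadPerField.values()) {
      result = Math.max(result, lead.numLevels());
    }
    return result;
  }

  /**
   * Doc id boundary of a level in the combined view: the min across the per-field leads
   * that have this level, so the combined level never overshoots any field's block boundary.
   */
  static int getDocIdUpTo(Map<String, Impacts> leadPerField, int level) {
    int result = Integer.MAX_VALUE;
    for (Impacts lead : leadPerField.values()) {
      if (lead.numLevels() > level) {
        result = Math.min(result, lead.getDocIdUpTo(level));
      }
    }
    return result;
  }
}

Taking the minimum boundary keeps a combined level from crossing any single field's skip-entry boundary, while taking the maximum number of levels keeps the deeper levels of the denser fields reachable for advanceShallow-style pruning.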
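
Editor's note (illustration, not part of the patches above): within a single field, the removed sumTf / freqUpperBound loop visible at the start of this section walks the per-term impacts in norm order with a priority queue, sums the term freqs, caps the sum at Integer.MAX_VALUE, and only keeps an entry when it beats the previous one. The sketch below reproduces the same upper-bound semantics in a simplified, unoptimized way (no priority queue); it assumes lucene-core on the classpath, and the class and method names are illustrative only, not the patch's actual implementation.

import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;
import org.apache.lucene.index.Impact;

final class PerFieldImpactMergeSketch {

  /** Largest freq among a term's impacts whose norm is <= the given norm (unsigned order). */
  private static long maxFreqUpTo(List<Impact> impacts, long norm) {
    long best = 0;
    for (Impact impact : impacts) {
      if (Long.compareUnsigned(impact.norm, norm) <= 0) {
        best = Math.max(best, impact.freq);
      }
    }
    return best;
  }

  /** Merge the per-term impacts of one field into upper-bound impacts for the summed freq. */
  static List<Impact> merge(List<List<Impact>> perTermImpacts) {
    // collect all distinct norms, in unsigned order, as candidate boundaries
    TreeSet<Long> norms = new TreeSet<>(Long::compareUnsigned);
    for (List<Impact> impacts : perTermImpacts) {
      for (Impact impact : impacts) {
        norms.add(impact.norm);
      }
    }
    List<Impact> merged = new ArrayList<>();
    for (long norm : norms) {
      long sumTf = 0;
      for (List<Impact> impacts : perTermImpacts) {
        sumTf += maxFreqUpTo(impacts, norm);
      }
      // the summed term freq may exceed Integer.MAX_VALUE, so saturate as in the patch
      int freqUpperBound = (int) Math.min(Integer.MAX_VALUE, sumTf);
      if (merged.isEmpty() || freqUpperBound > merged.get(merged.size() - 1).freq) {
        merged.add(new Impact(freqUpperBound, norm));
      } // otherwise the previous (lower-norm) impact is already more competitive
    }
    return merged;
  }
}

The per-field results of this kind of merge are what mergeImpactsAcrossFields() then has to reconcile, which is why the combined getImpacts(level) above recomputes docIdUpTo first rather than naively concatenating each field's level-local impacts.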