LUCENE-8995: TopSuggestDocsCollector#collect should be able to signal rejection #913

Open · wants to merge 12 commits into base: master
lucene/core/src/java/org/apache/lucene/util/fst/Util.java (5 changes: 0 additions & 5 deletions)
@@ -460,11 +460,6 @@ public TopResults<T> search() throws IOException {
continue;
}

if (results.size() == topN-1 && maxQueueDepth == topN) {
// Last path -- don't bother w/ queue anymore:
queue = null;

Member:
Whoa, was this opto breaking something? I guess if this final path is filtered out, we still need the queue? Have you run the suggest benchmarks to see if removing this opto hurt performance?


@cbuescher (Contributor, Author) commented on Oct 16, 2019:
As far as I understand, this optimization assumes we will surely accept (and collect) the path later in L516's acceptResult(), which always seems to be the case for collectors that don't reject. But if the collector that is eventually called via NRTSuggester's acceptResult() chooses to reject this path, we were losing expected results. This surfaced in the prefix completion tests I added. @jimczi might be able to explain this a bit better than me.

> Have you run the suggest benchmarks to see if removing this opto hurt performance?

No, where are they and how can I run them?

}

// We take path and find its "0 output completion",
// ie, just keep traversing the first arc with
// NO_OUTPUT that we can find, since this must lead
@@ -39,6 +39,7 @@
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.TopResults;

import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;

@@ -143,7 +144,8 @@ public void lookup(final CompletionScorer scorer, Bits acceptDocs, final TopSugg
// have been collected, regardless of the set topN value. This value is the
// maximum number of suggestions that can be collected.
final int topN = collector.getCountToCollect() * prefixPaths.size();
final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered);
final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio,
scorer.filtered || collector.canReject());

final CharsRefBuilder spare = new CharsRefBuilder();

@@ -200,23 +202,33 @@ protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
if (!scorer.accept(docID, acceptDocs)) {
return false;
}
boolean duplicateSurfaceForm = false;
boolean collected = false;
char[] surfaceForm = null;
if (collector.doSkipDuplicates()) {
// now record that we've seen this surface form:
char[] key = new char[spare.length()];
System.arraycopy(spare.chars(), 0, key, 0, spare.length());
if (collector.seenSurfaceForms.contains(key)) {
surfaceForm = new char[spare.length()];
System.arraycopy(spare.chars(), 0, surfaceForm, 0, spare.length());
if (collector.seenSurfaceForms.contains(surfaceForm)) {
// we already collected a higher scoring document with this key, in this segment:
return false;
duplicateSurfaceForm = true;
}
collector.seenSurfaceForms.add(key);
}
try {
float score = scorer.score(decode(path.output.output1), path.boost);
collector.collect(docID, spare.toCharsRef(), path.context, score);
return true;
} catch (IOException e) {
throw new RuntimeException(e);

// only try collecting if we haven't already detected this surface form as a duplicate (when collector.doSkipDuplicates() == true)
if (duplicateSurfaceForm == false) {
try {
float score = scorer.score(decode(path.output.output1), path.boost);
collected = collector.collect(docID, spare.toCharsRef(), path.context, score);
// if the collector de-duplicates on surface forms, record this surface form as "seen" now that it was actually collected
if (collector.doSkipDuplicates() && collected) {
collector.seenSurfaceForms.add(surfaceForm);
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return collected;
}
};

@@ -239,7 +251,11 @@ protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
}
// hits are also returned by search()
// we do not use it, instead collect at acceptResult
searcher.search();
TopResults<Pair<Long,BytesRef>> results = searcher.search();
if (results.isComplete == false) {
collector.setNotComplete();
}
return;
// search admissibility is not guaranteed
// see comment on getMaxTopNSearcherQueueSize
// assert search.isComplete;
@@ -20,6 +20,7 @@
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.util.fst.Util.TopNSearcher;

/**
* {@link org.apache.lucene.search.TopDocs} wrapper with
@@ -32,7 +33,8 @@ public class TopSuggestDocs extends TopDocs {
/**
* Singleton for empty {@link TopSuggestDocs}
*/
public final static TopSuggestDocs EMPTY = new TopSuggestDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new SuggestScoreDoc[0]);
public final static TopSuggestDocs EMPTY = new TopSuggestDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO),
new SuggestScoreDoc[0], true);

/**
* {@link org.apache.lucene.search.ScoreDoc} with an
@@ -88,13 +90,19 @@ public String toString() {
}
}

/**
* Indicates that all possibilities for completions were exhausted
*/
final boolean isComplete;

/**
* {@link org.apache.lucene.search.TopDocs} wrapper with
* {@link TopSuggestDocs.SuggestScoreDoc}
* instead of {@link org.apache.lucene.search.ScoreDoc}
*/
public TopSuggestDocs(TotalHits totalHits, SuggestScoreDoc[] scoreDocs) {
public TopSuggestDocs(TotalHits totalHits, SuggestScoreDoc[] scoreDocs, boolean isComplete) {
super(totalHits, scoreDocs);
this.isComplete = isComplete;
}

/**
@@ -116,19 +124,29 @@ public SuggestScoreDoc[] scoreLookupDocs() {
*/
public static TopSuggestDocs merge(int topN, TopSuggestDocs[] shardHits) {
SuggestScoreDocPriorityQueue priorityQueue = new SuggestScoreDocPriorityQueue(topN);
boolean allComplete = true;
for (TopSuggestDocs shardHit : shardHits) {
for (SuggestScoreDoc scoreDoc : shardHit.scoreLookupDocs()) {
if (scoreDoc == priorityQueue.insertWithOverflow(scoreDoc)) {
break;
}
}
allComplete &= shardHit.isComplete;
}
SuggestScoreDoc[] topNResults = priorityQueue.getResults();
if (topNResults.length > 0) {
return new TopSuggestDocs(new TotalHits(topNResults.length, TotalHits.Relation.EQUAL_TO), topNResults);
return new TopSuggestDocs(new TotalHits(topNResults.length, TotalHits.Relation.EQUAL_TO), topNResults,
allComplete);
} else {
return TopSuggestDocs.EMPTY;
}
}

/**
* Indicates if the list of results is complete or not. Might be <code>false</code> if the {@link TopNSearcher} rejected
* too many of the queued results.

Member:

Isn't it sometimes false if the TopNSearcher's queue filled up and it chose not to pursue some low-scoring partial paths?


Contributor:

The admissibility of the search is computed from the reject count, so a value of false means that we exhausted all the paths but had to reject all of them, so the topN is truncated. It's hard to follow the full logic, but it should be ok as long as it is ok to return less than the topN when there are more rejections than the queue size can handle?

*/
public boolean isComplete() {
return this.isComplete;
}
}
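
As an illustration of how the new flag might be consumed on the caller side (this sketch is not part of the patch; the helper name, the searcher/query setup, and the retry policy are assumptions):

```java
import java.io.IOException;

import org.apache.lucene.search.suggest.document.CompletionQuery;
import org.apache.lucene.search.suggest.document.SuggestIndexSearcher;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;

class SuggestRetryExample {
  // Hypothetical helper: re-runs the suggest query with a larger topN when the result
  // set was truncated because too many candidates were rejected (deleted docs,
  // duplicate surface forms, filtering).
  static TopSuggestDocs suggestWithRetry(SuggestIndexSearcher searcher, CompletionQuery query, int topN) throws IOException {
    TopSuggestDocs hits = searcher.suggest(query, topN, true);
    if (hits.isComplete() == false) {
      // the suggester reported that rejections may have truncated the results
      hits = searcher.suggest(query, topN * 2, true);
    }
    return hits;
  }
}
```

Note that merge() above propagates the flag across shards: the merged result only reports isComplete() == true when every shard hit did.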
@@ -43,6 +43,8 @@
* <p>
* Subclasses should only override
* {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}.
* Overriding subclasses can opt to reject documents, in which case
* they should return <tt>false</tt> to signal this back to the caller.
* <p>
* NOTE: {@link #setScorer(org.apache.lucene.search.Scorable)} and
* {@link #collect(int)} is not used
@@ -63,6 +65,8 @@ public class TopSuggestDocsCollector extends SimpleCollector {
/** Document base offset for the current Leaf */
protected int docBase;

private boolean isComplete = true;

/**
* Sole constructor
*
@@ -113,10 +117,14 @@ protected void doSetNextReader(LeafReaderContext context) throws IOException {
* similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
* but for completions.
*
* This implementation always returns <tt>true</tt> because it collects all documents, but
* subclasses overriding this can choose to reject documents, in which case they should
* return <tt>false</tt> to signal this back to the caller.
*
* NOTE: collection at the leaf level is guaranteed to be in
* descending order of score
*/
public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
public boolean collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, context, score);
if (current == priorityQueue.insertWithOverflow(current)) {
// if the current SuggestScoreDoc has overflown from pq,
@@ -125,6 +133,7 @@ public void collect(int docID, CharSequence key, CharSequence context, float sco
// TODO: reuse the overflow instance?
throw new CollectionTerminatedException();
}
return true;
}

/**
@@ -179,7 +188,8 @@ public TopSuggestDocs get() throws IOException {
}

if (suggestScoreDocs.length > 0) {
return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO),
suggestScoreDocs, this.isComplete);
} else {
return TopSuggestDocs.EMPTY;
}
@@ -201,4 +211,27 @@ public void collect(int doc) throws IOException {
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE;
}

/**
* Returns true if the collector has exhausted all possibilities to collect results
*/
boolean isComplete() {

Contributor:

Is this needed now that we provide the information in the TopSuggestDocs?


Contributor Author:

Not necessary, but since we need the underlying flag and it's currently private, I thought it's okay to have at least a package-private getter. Or does it bloat the class too much? Happy to remove it either way, wdyt?

return this.isComplete;
}

/**
* Call to signal that, during collection, at least one segment might have returned incomplete results, e.g. because
* of too many rejections
*/
void setNotComplete() {
this.isComplete = false;
}

/**
* indicate if this collectors {@link #collect(int, CharSequence, CharSequence, float)} method potentially rejects
* documents. This information can be used to e.g. estimating necessary queue sizes in the searcher.
*/
protected boolean canReject() {
return false;
}
}
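
To make the new collect() contract concrete, here is a minimal, hypothetical subclass sketch; the class name, the Bits-based filter, and its construction are illustrative assumptions, not part of this change:

```java
import java.io.IOException;

import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
import org.apache.lucene.util.Bits;

// Hypothetical collector that drops suggestions for documents outside an accepted set;
// rejected documents return false so the suggester can keep exploring other paths
// instead of silently losing slots.
class FilteringSuggestCollector extends TopSuggestDocsCollector {
  private final Bits acceptedDocs; // top-level docIDs that may be suggested

  FilteringSuggestCollector(int num, Bits acceptedDocs) {
    super(num, false); // no duplicate skipping in this sketch
    this.acceptedDocs = acceptedDocs;
  }

  @Override
  public boolean collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
    if (acceptedDocs != null && acceptedDocs.get(docBase + docID) == false) {
      return false; // rejected: signal this back to the caller
    }
    return super.collect(docID, key, context, score);
  }

  @Override
  protected boolean canReject() {
    return true; // lets NRTSuggester size its TopNSearcher queue for possible rejections
  }
}
```

Returning true from canReject() feeds into the getMaxTopNSearcherQueueSize call changed above, so a rejecting collector gets the same larger queue that a filtered suggester already gets.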