LUCENE-8995: TopSuggestDocsCollector#collect should be able to signal rejection #913

Open · wants to merge 12 commits into master
lucene/core/src/java/org/apache/lucene/util/fst/Util.java (5 changes: 0 additions & 5 deletions)

@@ -460,11 +460,6 @@ public TopResults<T> search() throws IOException {
         continue;
       }

-      if (results.size() == topN-1 && maxQueueDepth == topN) {
-        // Last path -- don't bother w/ queue anymore:
-        queue = null;

Member commented:

Whoa, was this opto breaking something? I guess if this final path is filtered out, we still need the queue? Have you run the suggest benchmarks to see if removing this opto hurt performance?


@cbuescher (Contributor, Author) commented on Oct 16, 2019:

As far as I understand, this optimization assumes we surely accept (and collect) the path later in L516's acceptResult(), which always seems to be the case for collectors that don't reject; but if the collector that is eventually called via NRTSuggester's acceptResult() chooses to reject this option, we were losing expected results. This surfaced in the prefix completion tests I added. @jimczi might be able to explain this a bit better than me.

> Have you run the suggest benchmarks to see if removing this opto hurt performance?

No, where are they and how can I run them?

-      }
-
       // We take path and find its "0 output completion",
       // ie, just keep traversing the first arc with
       // NO_OUTPUT that we can find, since this must lead
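
For context, here is a minimal, self-contained sketch of why nulling the queue on the supposed last path breaks rejecting collectors. This is not the actual Util.TopNSearcher code; all names are illustrative:

    import java.util.ArrayList;
    import java.util.Deque;
    import java.util.List;
    import java.util.function.Predicate;

    class TopNSketch {
      // Collect up to topN paths, consulting acceptResult before keeping each one.
      static <T> List<T> searchTopN(Deque<T> queue, int topN, Predicate<T> acceptResult) {
        List<T> results = new ArrayList<>();
        while (results.size() < topN && queue != null && !queue.isEmpty()) {
          T path = queue.pop();
          // The removed optimization effectively did "queue = null" here once
          // results.size() == topN - 1, assuming the final path is surely accepted...
          if (acceptResult.test(path)) {
            results.add(path);
          }
          // ...but if acceptResult rejects the path, the queue is still needed
          // to supply a replacement candidate; with queue == null the loop
          // would end one result short.
        }
        return results;
      }
    }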

NRTSuggester.java

@@ -39,6 +39,7 @@
 import org.apache.lucene.util.fst.PairOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.fst.Util.TopResults;

 import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;

@@ -200,23 +201,30 @@ protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
       if (!scorer.accept(docID, acceptDocs)) {
         return false;
       }
+      boolean duplicateSurfaceForm = false;
+      boolean collected = false;
       if (collector.doSkipDuplicates()) {
         // now record that we've seen this surface form:
         char[] key = new char[spare.length()];
         System.arraycopy(spare.chars(), 0, key, 0, spare.length());
         if (collector.seenSurfaceForms.contains(key)) {
           // we already collected a higher scoring document with this key, in this segment:
-          return false;
+          duplicateSurfaceForm = true;
+        } else {
+          collector.seenSurfaceForms.add(key);
         }
-        collector.seenSurfaceForms.add(key);
       }
-      try {
-        float score = scorer.score(decode(path.output.output1), path.boost);
-        collector.collect(docID, spare.toCharsRef(), path.context, score);
-        return true;
-      } catch (IOException e) {
-        throw new RuntimeException(e);
-      }
+
+      // only try collecting if we didn't already detect a duplicate surface form
+      // (only possible when collector.doSkipDuplicates() is true)
+      if (duplicateSurfaceForm == false) {
+        try {
+          float score = scorer.score(decode(path.output.output1), path.boost);
+          collected = collector.collect(docID, spare.toCharsRef(), path.context, score);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+      return collected;
     }
   };

@@ -239,7 +247,11 @@ protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
       }
       // hits are also returned by search()
       // we do not use it, instead collect at acceptResult
-      searcher.search();
+      TopResults<Pair<Long,BytesRef>> results = searcher.search();
+      if (results.isComplete == false) {
+        collector.notComplete();
+      }
+      return;
       // search admissibility is not guaranteed
       // see comment on getMaxTopNSearcherQueueSize
       // assert search.isComplete;

TopSuggestDocsCollector.java

@@ -63,6 +63,8 @@ public class TopSuggestDocsCollector extends SimpleCollector {
   /** Document base offset for the current Leaf */
   protected int docBase;

+  private boolean isComplete = true;
+
   /**
    * Sole constructor
    *
@@ -116,7 +118,7 @@ protected void doSetNextReader(LeafReaderContext context) throws IOException {
    * NOTE: collection at the leaf level is guaranteed to be in
    * descending order of score
    */
-  public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
+  public boolean collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
     SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, context, score);
     if (current == priorityQueue.insertWithOverflow(current)) {
       // if the current SuggestScoreDoc has overflown from pq,
@@ -125,6 +127,7 @@ public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
       // TODO: reuse the overflow instance?
       throw new CollectionTerminatedException();
     }
+    return true;
   }

/**
@@ -201,4 +204,19 @@ public void collect(int doc) throws IOException {
   public ScoreMode scoreMode() {
     return ScoreMode.COMPLETE;
   }

+  /**
+   * Returns true if the collector exhausted all possibilities to collect results
+   */
+  public boolean isComplete() {
+    return this.isComplete;
+  }
+
+  /**
+   * Call to signal that during collection at least one segment might have returned incomplete
+   * results, e.g. because of too many rejections
+   */
+  void notComplete() {
+    this.isComplete = false;
+  }
}
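
A usage illustration of the new API (hypothetical caller code, not part of this PR; suggestIndexSearcher and query are assumed to exist, and the docID % 2 check is a stand-in for an application-specific rejection rule):

    TopSuggestDocsCollector collector = new TopSuggestDocsCollector(5, false) {
      @Override
      public boolean collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
        if (docID % 2 == 0) {   // stand-in for an application-specific veto
          return false;         // signal rejection back to the suggester
        }
        return super.collect(docID, key, context, score);
      }
    };
    suggestIndexSearcher.suggest(query, collector);
    if (collector.isComplete() == false) {
      // at least one segment may have exhausted its candidate paths before
      // topN results were accepted, so fewer suggestions may be returned
    }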

@@ -17,10 +17,15 @@
 package org.apache.lucene.search.suggest.document;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
 import java.util.Objects;
+import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockSynonymAnalyzer;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
@@ -253,6 +258,61 @@ public void testDocFiltering() throws Exception {
     iw.close();
   }

+  /**
+   * Test that the correct number of documents is collected when using a collector that also rejects documents.
+   */
+  public void testCollectorThatRejects() throws Exception {
+    // use a synonym analyzer to get multiple paths to the same suggested document; this mock adds "dog" as a synonym for "dogs"
+    Analyzer analyzer = new MockSynonymAnalyzer();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
+    List<Entry> expectedResults = new ArrayList<Entry>();
+
+    for (int docCount = 10; docCount > 0; docCount--) {
+      Document document = new Document();
+      String value = "ab" + docCount + " dogs";
+      document.add(new SuggestField("suggest_field", value, docCount));
+      expectedResults.add(new Entry(value, docCount));
+      iw.addDocument(document);
+    }
+
+    if (rarely()) {
+      iw.commit();
+    }
+
+    DirectoryReader reader = iw.getReader();
+    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
+
+    PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "ab"));
+    int topN = 5;
+
+    // use a TopSuggestDocsCollector that rejects results with duplicate docIds
+    TopSuggestDocsCollector collector = new TopSuggestDocsCollector(topN, false) {
+
+      private Set<Integer> seenDocIds = new HashSet<>();
+
+      @Override
+      public boolean collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
+        int globalDocId = docID + docBase;
+        boolean collected = false;
+        if (seenDocIds.contains(globalDocId) == false) {

Member commented:

Hmm why would the suggester even send the same docID multiple times? There is only one suggestion (SuggestField) per indexed document in this test ... maybe improve the test to assert that sometimes we reject and sometimes we don't?

@cbuescher (Contributor, Author) commented:

The collector is called multiple times with the same docID because of the MockSynonymAnalyzer used in the test setup, which adds "dog" for "dogs", so each document has two completion paths. This collector is meant to deduplicate them. I added a note explaining this. This is a simplified version of the behaviour we observe in elastic/elasticsearch#46445.
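
To make the two paths concrete (an illustration of the comment above, not output from the test):

    surface form "ab7 dogs"
      analyzed path 1: "ab7 dogs"
      analyzed path 2: "ab7 dog"    (MockSynonymAnalyzer adds "dog" for "dogs")

Both paths lead to the same docID, so collect() is invoked twice for it and the second call is rejected.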

+          super.collect(docID, key, context, score);
+          seenDocIds.add(globalDocId);
+          collected = true;
+        }
+        return collected;
+      }
+    };
+
+    indexSearcher.suggest(query, collector);
+    assertSuggestions(collector.get(), expectedResults.subList(0, topN).toArray(new Entry[0]));
+
+    // TODO expecting true here, why false?

@jimczi (Contributor) commented on Oct 2, 2019:

It seems that NRTSuggesterBuilder#maxAnalyzedPathsPerOutput is not computed correctly. From what I understand, it records the number of suggestions with the same analyzed form, but the comment says it should be the highest number of analyzed paths we saw for any input surface form. So imo this is a bug; it's not exactly related to this change, so we should probably open a new issue for it.

Member commented:

+1 to open a new issue for this.

Indeed, the maxAnalyzedPathsPerOutput comment in NRTSuggester.java seems to claim it's the max number of analyzed forms for a surface form (input), e.g. a graph token stream would create multiple analyzed forms.

But what NRTSuggesterBuilder.java seems to actually compute is the maximum number of unique surface forms that analyze to the same analyzed form.

And so I think the admissibility of the search is in question -- we use this to size the queue "properly", but it's not clear that works today?

Let's definitely open a new issue ... this stuff is hard to think about!
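
To make the two readings concrete (illustrative example, not taken from the code):

    what the comment claims:   one surface form, several analyzed forms
      "dogs" -> {"dogs", "dog"}      (2 analyzed paths for 1 input)
    what the builder computes: several surface forms, one analyzed form
      {"dogs", "dog"} -> "dog"       (2 inputs sharing 1 analyzed form)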

Contributor commented:

I'll open an issue. I also wonder if we shouldn't rely on the fact that the top suggest collector will early-terminate anyway: whenever we expect rejections (because of deleted docs or because we deduplicate suggestions per doc) we could set the queue size to its maximum value (5000). Currently we have different heuristics that try to pick a sensible value automatically, but there is no guarantee of admissibility. For instance, if we want to deduplicate by document id we should ensure that the queue size is greater than topN*maxAnalyzedValuesPerDoc, and we'd need to compute that value at index time.
I may be completely off, but it would be interesting to see the effect of setting the queue size to its maximum value on all searches. That way admissibility is easier to reason about and we don't need to correlate it with the choice made by the heuristic.
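
As a worked example of the sizing concern, under the assumptions in the comment above: with topN = 5 and maxAnalyzedValuesPerDoc = 2, deduplicating by document id can reject up to one path per accepted document, so the searcher may need to examine topN * maxAnalyzedValuesPerDoc = 10 paths; any queue smaller than that cannot guarantee 5 accepted results, hence no admissibility.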

+    assertFalse(collector.isComplete());
+
+    reader.close();
+    iw.close();
+  }

   public void testAnalyzerDefaults() throws Exception {
     Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
     CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);