hunspell: speed up GeneratingSuggester by not deserializing non-suggestible roots
apache · Oct 18, 2022 · 73fe277 · 73fe277
1 parent 3adec5b
commit 73fe277
Show file tree

Hide file tree

Showing 7 changed files with 85 additions and 42 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -60,7 +60,7 @@ Improvements
 Optimizations
 ---------------------
 
-* GITHUB#11857: Hunspell: improved suggestion performance
+* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance
 
 Bug Fixes
 ---------------------

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -49,6 +49,7 @@
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -1131,7 +1132,8 @@ private WordStorage readSortedDictionaries(
 
     Map<String, Integer> morphIndices = new HashMap<>();
 
-    WordStorage.Builder builder = new WordStorage.Builder(wordCount, hasCustomMorphData, flags);
+    WordStorage.Builder builder =
+        new WordStorage.Builder(wordCount, hasCustomMorphData, flags, allNonSuggestibleFlags());
 
     try (ByteSequencesReader reader =
         new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@@ -1197,6 +1199,13 @@ private WordStorage readSortedDictionaries(
     }
   }
 
+  char[] allNonSuggestibleFlags() {
+    return Dictionary.toSortedCharArray(
+        Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)
+            .filter(c -> c != FLAG_UNSET)
+            .collect(Collectors.toSet()));
+  }
+
   private List<String> readMorphFields(String word, String unparsed) {
     List<String> morphFields = null;
     for (String datum : splitMorphData(unparsed)) {

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -19,8 +19,6 @@
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
-import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
-import static org.apache.lucene.analysis.hunspell.Dictionary.HIDDEN_FLAG;
 
 import java.util.ArrayList;
 import java.util.Comparator;
@@ -31,7 +29,6 @@
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.FST;
 
@@ -73,19 +70,10 @@ char transformChar(char c) {
           }
         };
 
-    dictionary.words.processAllWords(
-        Math.max(1, word.length() - 4),
-        word.length() + 4,
+    dictionary.words.processSuggestibleWords(
+        Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
+        word.length() + MAX_ROOT_LENGTH_DIFF,
         (rootChars, forms) -> {
-          assert rootChars.length > 0;
-          if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
-            assert rootChars.length < word.length(); // processAllWords takes care of longer keys
-            return;
-          }
-
-          int suitable = filter.findSuitableFormIndex(forms, 0);
-          if (suitable < 0) return;
-
           if (ignoreTitleCaseRoots
               && Character.isUpperCase(rootChars.charAt(0))
               && WordCase.caseOf(rootChars) == WordCase.TITLE) {
@@ -99,13 +87,14 @@ char transformChar(char c) {
 
           sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
 
-          if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
+          if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
             return;
           }
 
           speller.checkCanceled.run();
 
           String root = rootChars.toString();
+          int suitable = filter.findSuitableFormIndex(forms, 0);
           do {
             roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
             suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
@@ -126,11 +115,7 @@ private static class EntryFilter {
     EntryFilter(Dictionary dic) {
       formStep = dic.formStep();
       flagLookup = dic.flagLookup;
-
-      Character[] flags = {HIDDEN_FLAG, dic.noSuggest, dic.forbiddenword, dic.onlyincompound};
-      excludeFlags =
-          Dictionary.toSortedCharArray(
-              Stream.of(flags).filter(c -> c != FLAG_UNSET).collect(Collectors.toSet()));
+      excludeFlags = dic.allNonSuggestibleFlags();
     }
 
     int findSuitableFormIndex(IntsRef forms, int start) {

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -52,17 +52,19 @@ class WordStorage {
   private static final int OFFSET_BITS = 25;
   private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
   private static final int COLLISION_MASK = 0x40;
-  private static final int MAX_STORED_LENGTH = COLLISION_MASK - 1;
+  private static final int SUGGESTIBLE_MASK = 0x20;
+  private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;
 
   /**
    * A map from word's hash (modulo array's length) into an int containing:
    *
    * <ul>
    *   <li>lower {@link #OFFSET_BITS}: the offset in {@link #wordData} of the last entry with this
    *       hash
-   *   <li>the remaining highest bits: COLLISION+LENGTH info for that entry, i.e. one bit indicating
-   *       whether there are other entries with the same hash, and the length of the entry in chars,
-   *       or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
+   *   <li>the remaining highest bits: COLLISION+SUGGESTIBLE+LENGTH info for that entry, i.e. one
+   *       bit indicating whether there are other entries with the same hash, one bit indicating
+   *       whether this entry makes sense to be used in suggestions, and the length of the entry in
+   *       chars, or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
    * </ul>
    */
   private final int[] hashTable;
@@ -77,8 +79,8 @@ class WordStorage {
    *       single-character entries
    *   <li>(Optional, for hash-colliding entries only)
    *       <ul>
-   *         <li>BYTE: COLLISION+LENGTH info (see {@link #hashTable}) for the previous entry with
-   *             the same hash
+   *         <li>BYTE: COLLISION+SUGGESTIBLE+LENGTH info (see {@link #hashTable}) for the previous
+   *             entry with the same hash
    *         <li>VINT: (delta) pointer to the previous entry
    *       </ul>
    *   <li>(Optional, for non-leaf entries only) VINT+: word form data, returned from {@link
@@ -140,12 +142,18 @@ private static boolean hasCollision(int mask) {
     return (mask & COLLISION_MASK) != 0;
   }
 
+  private static boolean hasSuggestibleEntries(int mask) {
+    return (mask & SUGGESTIBLE_MASK) != 0;
+  }
+
   /**
    * Calls the processor for every dictionary entry with length between minLength and maxLength,
-   * both ends inclusive. Note that the callback arguments (word and forms) are reused, so they can
-   * be modified in any way, but may not be saved for later by the processor
+   * both ends inclusive, and at least one suggestible alternative (without SUBSTANDARD, NOSUGGEST,
+   * FORBIDDENWORD or ONLYINCOMPOUND flags). Note that the callback arguments (word and forms) are
+   * reused, so they can be modified in any way, but may not be saved for later by the processor
    */
-  void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
+  void processSuggestibleWords(
+      int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
     assert minLength <= maxLength;
     CharsRef chars = new CharsRef(maxLength);
     IntsRef forms = new IntsRef();
@@ -162,7 +170,8 @@ void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef>
         int prevPos = pos - in.readVInt();
 
         boolean last = !hasCollision(mask);
-        boolean mightMatch = hasLengthInRange(mask, minLength, maxLength);
+        boolean mightMatch =
+            hasSuggestibleEntries(mask) && hasLengthInRange(mask, minLength, maxLength);
 
         if (!last) {
           mask = in.readByte();
@@ -235,6 +244,7 @@ static class Builder {
     private final boolean hasCustomMorphData;
     private final int[] hashTable;
     private byte[] wordData;
+    private final char[] noSuggestFlags;
     private final int[] chainLengths;
 
     private final IntsRefBuilder currentOrds = new IntsRefBuilder();
@@ -253,10 +263,15 @@ static class Builder {
      *     pre-size the hash table. This argument can be a bit larger than the actual word count,
      *     but not smaller.
      */
-    Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
+    Builder(
+        int wordCount,
+        boolean hasCustomMorphData,
+        FlagEnumerator flagEnumerator,
+        char[] noSuggestFlags) {
       this.wordCount = wordCount;
       this.flagEnumerator = flagEnumerator;
       this.hasCustomMorphData = hasCustomMorphData;
+      this.noSuggestFlags = noSuggestFlags;
 
       hashTable = new int[wordCount];
       wordData = new byte[wordCount * 6];
@@ -317,15 +332,15 @@ private int flushGroup() throws IOException {
       currentOrds.clear();
       boolean hasNonHidden = false;
       for (char[] flags : group) {
-        if (!hasHiddenFlag(flags)) {
+        if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
           hasNonHidden = true;
           break;
         }
       }
 
       for (int i = 0; i < group.size(); i++) {
         char[] flags = group.get(i);
-        if (hasNonHidden && hasHiddenFlag(flags)) {
+        if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
           continue;
         }
 
@@ -353,7 +368,9 @@ private int flushGroup() throws IOException {
       int prevCode = hashTable[hash];
 
       int mask =
-          (prevCode == 0 ? 0 : COLLISION_MASK) | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
+          (prevCode == 0 ? 0 : COLLISION_MASK)
+              | (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
+              | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
       hashTable[hash] = (mask << OFFSET_BITS) | pos;
 
       if (++chainLengths[hash] > 20) {
@@ -375,9 +392,16 @@ private int flushGroup() throws IOException {
       return pos;
     }
 
-    private static boolean hasHiddenFlag(char[] flags) {
+    private boolean hasNoSuggestFlag(char[] flags) {
       for (char flag : flags) {
-        if (flag == Dictionary.HIDDEN_FLAG) {
+        if (hasFlag(noSuggestFlags, flag)) return true;
+      }
+      return false;
+    }
+
+    private static boolean hasFlag(char[] flags, char flag) {
+      for (char f : flags) {
+        if (f == flag) {
           return true;
         }
       }

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -86,11 +86,16 @@ public void testProcessAllWords() throws Exception {
     }
   }
 
+  public void testProcessSuggestibleWords() throws Exception {
+    Dictionary dictionary = loadDictionary("suggestible.aff", "suggestible.dic");
+
+    Set<String> processed = processSuggestibleWords(dictionary, 1, 100);
+    assertEquals(Set.of("normal", "ambiguous"), processed);
+  }
+
   private void checkProcessWords(
       Dictionary dictionary, Set<String> allWords, int minLength, int maxLength) {
-    Set<String> processed = new HashSet<>();
-    dictionary.words.processAllWords(
-        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+    Set<String> processed = processSuggestibleWords(dictionary, minLength, maxLength);
 
     Set<String> filtered =
         allWords.stream()
@@ -100,6 +105,14 @@ private void checkProcessWords(
     assertEquals("For lengths [" + minLength + "," + maxLength + "]", filtered, processed);
   }
 
+  private static Set<String> processSuggestibleWords(
+      Dictionary dictionary, int minLength, int maxLength) {
+    Set<String> processed = new HashSet<>();
+    dictionary.words.processSuggestibleWords(
+        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+    return processed;
+  }
+
   public void testCompressedDictionary() throws Exception {
     Dictionary dictionary = loadDictionary("compressed.aff", "compressed.dic");
     assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.aff
@@ -0,0 +1,4 @@
+ONLYINCOMPOUND O
+NOSUGGEST N
+FORBIDDENWORD F
+SUBSTANDARD S
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.dic
@@ -0,0 +1,8 @@
+1
+normal
+compound/O
+forbidden/F
+nosuggest/N
+substandard/S
+ambiguous
+ambiguous/N