From 73fe27712c17ab89a74fadd750c7d53e37bbef8e Mon Sep 17 00:00:00 2001
From: Peter Gromov
Date: Tue, 18 Oct 2022 12:10:22 +0200
Subject: [PATCH] hunspell: speed up GeneratingSuggester by not deserializing
 non-suggestible roots

---
 lucene/CHANGES.txt                                 |  2 +-
 .../lucene/analysis/hunspell/Dictionary.java       | 11 +++-
 .../hunspell/GeneratingSuggester.java              | 27 ++-------
 .../lucene/analysis/hunspell/WordStorage.java      | 56 +++++++++++++------
 .../analysis/hunspell/TestDictionary.java          | 19 ++++++-
 .../lucene/analysis/hunspell/suggestible.aff       |  4 ++
 .../lucene/analysis/hunspell/suggestible.dic       |  8 +++
 7 files changed, 85 insertions(+), 42 deletions(-)
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.dic

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4c1fbb849609..1207b0df74a3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -60,7 +60,7 @@ Improvements
 
 Optimizations
 ---------------------
-* GITHUB#11857: Hunspell: improved suggestion performance
+* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance
 
 Bug Fixes
 ---------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 09212794a8bf..e94047b67db3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -49,6 +49,7 @@
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -1131,7 +1132,8 @@ private WordStorage readSortedDictionaries(
 
     Map<String, Integer> morphIndices = new HashMap<>();
 
-    WordStorage.Builder builder = new WordStorage.Builder(wordCount, hasCustomMorphData, flags);
+    WordStorage.Builder builder =
+        new WordStorage.Builder(wordCount, hasCustomMorphData, flags, allNonSuggestibleFlags());
 
     try (ByteSequencesReader reader =
         new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@@ -1197,6 +1199,13 @@ private WordStorage readSortedDictionaries(
       }
     }
 
+  char[] allNonSuggestibleFlags() {
+    return Dictionary.toSortedCharArray(
+        Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)
+            .filter(c -> c != FLAG_UNSET)
+            .collect(Collectors.toSet()));
+  }
+
   private List<String> readMorphFields(String word, String unparsed) {
     List<String> morphFields = null;
     for (String datum : splitMorphData(unparsed)) {
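The new Dictionary.allNonSuggestibleFlags() above gathers every flag that disqualifies a root from suggestions (the internal hidden flag plus NOSUGGEST, FORBIDDENWORD, ONLYINCOMPOUND and SUBSTANDARD, when declared) into one sorted char[]. A rough standalone sketch of that computation, with invented class and constant names and the flag letters from the new test files, could look like this:

import java.util.Arrays;
import java.util.TreeSet;

class NonSuggestibleFlagsSketch {
  static final char FLAG_UNSET = (char) 0; // stand-in for Dictionary.FLAG_UNSET

  // Mirrors the idea of allNonSuggestibleFlags(): drop unset flags, deduplicate, sort.
  static char[] allNonSuggestibleFlags(char... specialFlags) {
    TreeSet<Character> present = new TreeSet<>();
    for (char flag : specialFlags) {
      if (flag != FLAG_UNSET) {
        present.add(flag);
      }
    }
    char[] result = new char[present.size()];
    int i = 0;
    for (char flag : present) {
      result[i++] = flag; // TreeSet iteration is ascending, so the array ends up sorted
    }
    return result;
  }

  public static void main(String[] args) {
    // Flag letters as in the new test files: N=NOSUGGEST, F=FORBIDDENWORD, O=ONLYINCOMPOUND;
    // the last argument is left unset to show the FLAG_UNSET filtering.
    System.out.println(Arrays.toString(allNonSuggestibleFlags('N', 'F', 'O', FLAG_UNSET)));
    // prints [F, N, O]
  }
}

Computing and sorting the array once at load time keeps the later per-entry membership checks cheap and allocation-free.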
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
index 6d999fffbf50..08b58f925f95 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -19,8 +19,6 @@
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
 import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD;
-import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
-import static org.apache.lucene.analysis.hunspell.Dictionary.HIDDEN_FLAG;
 
 import java.util.ArrayList;
 import java.util.Comparator;
@@ -31,7 +29,6 @@
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.FST;
 
@@ -73,19 +70,10 @@ char transformChar(char c) {
           }
         };
 
-    dictionary.words.processAllWords(
-        Math.max(1, word.length() - 4),
-        word.length() + 4,
+    dictionary.words.processSuggestibleWords(
+        Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
+        word.length() + MAX_ROOT_LENGTH_DIFF,
         (rootChars, forms) -> {
-          assert rootChars.length > 0;
-          if (Math.abs(rootChars.length - word.length()) > MAX_ROOT_LENGTH_DIFF) {
-            assert rootChars.length < word.length(); // processAllWords takes care of longer keys
-            return;
-          }
-
-          int suitable = filter.findSuitableFormIndex(forms, 0);
-          if (suitable < 0) return;
-
           if (ignoreTitleCaseRoots
               && Character.isUpperCase(rootChars.charAt(0))
               && WordCase.caseOf(rootChars) == WordCase.TITLE) {
@@ -99,13 +87,14 @@ char transformChar(char c) {
           sc +=
               commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
 
-          if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
+          if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
             return;
           }
 
           speller.checkCanceled.run();
 
           String root = rootChars.toString();
+          int suitable = filter.findSuitableFormIndex(forms, 0);
           do {
             roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
             suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
@@ -126,11 +115,7 @@ private static class EntryFilter {
     EntryFilter(Dictionary dic) {
       formStep = dic.formStep();
       flagLookup = dic.flagLookup;
-
-      Character[] flags = {HIDDEN_FLAG, dic.noSuggest, dic.forbiddenword, dic.onlyincompound};
-      excludeFlags =
-          Dictionary.toSortedCharArray(
-              Stream.of(flags).filter(c -> c != FLAG_UNSET).collect(Collectors.toSet()));
+      excludeFlags = dic.allNonSuggestibleFlags();
     }
 
     int findSuitableFormIndex(IntsRef forms, int start) {
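The net effect of the GeneratingSuggester change above: the length filter moves into the enumeration bounds, non-suggestible roots never reach the callback, and the per-root form lookup is deferred until after the score cutoff (which now uses <= so ties are skipped as well). A rough standalone sketch of that ordering, with invented names and a simplified stand-in for the real n-gram scoring:

import java.util.List;
import java.util.PriorityQueue;

class RootScoringSketch {
  record Weighted(String root, int score) {}

  static final int MAX_ROOTS = 100; // same cap the real suggester uses

  static PriorityQueue<Weighted> bestRoots(String word, List<String> suggestibleRoots) {
    // Min-heap by score: peek() is always the weakest root currently kept.
    PriorityQueue<Weighted> roots =
        new PriorityQueue<>((a, b) -> Integer.compare(a.score(), b.score()));
    for (String root : suggestibleRoots) { // already filtered by the storage layer
      int sc = commonPrefix(word, root) - Math.abs(word.length() - root.length()); // stand-in score
      if (roots.size() == MAX_ROOTS && sc <= roots.peek().score()) {
        continue; // cheap cutoff first; only past this point does the real code scan the root's forms
      }
      roots.add(new Weighted(root, sc));
      if (roots.size() > MAX_ROOTS) {
        roots.poll();
      }
    }
    return roots;
  }

  private static int commonPrefix(String a, String b) {
    int i = 0;
    while (i < a.length() && i < b.length() && a.charAt(i) == b.charAt(i)) {
      i++;
    }
    return i;
  }
}

Keeping the queue capped at MAX_ROOTS makes the cutoff a single comparison against the weakest kept root, so most enumerated roots are rejected before any per-form work happens.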
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
index 4d457e1f3068..b66428010b51 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@@ -52,7 +52,8 @@ class WordStorage {
   private static final int OFFSET_BITS = 25;
   private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
   private static final int COLLISION_MASK = 0x40;
-  private static final int MAX_STORED_LENGTH = COLLISION_MASK - 1;
+  private static final int SUGGESTIBLE_MASK = 0x20;
+  private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;
 
   /**
    * A map from word's hash (modulo array's length) into an int containing:
@@ -60,9 +61,10 @@ class WordStorage {
    * <ul>
    *   <li>lower {@link #OFFSET_BITS}: the offset in {@link #wordData} of the last entry with this
    *       hash
-   *   <li>the remaining highest bits: COLLISION+LENGTH info for that entry, i.e. one bit indicating
-   *       whether there are other entries with the same hash, and the length of the entry in chars,
-   *       or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
+   *   <li>the remaining highest bits: COLLISION+SUGGESTIBLE+LENGTH info for that entry, i.e. one
+   *       bit indicating whether there are other entries with the same hash, one bit indicating
+   *       whether this entry makes sense to be used in suggestions, and the length of the entry in
+   *       chars, or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
    * </ul>
    */
   private final int[] hashTable;
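To make the bit layout described in the javadoc above concrete, here is a small standalone sketch (not the Lucene class; the constants are copied from the hunk above, the method names are invented): the low OFFSET_BITS bits hold the wordData offset, the next five bits hold the entry length capped at MAX_STORED_LENGTH, then come one SUGGESTIBLE bit and one COLLISION bit, so a traversal can reject an entry without decoding its characters or forms.

class HashSlotSketch {
  static final int OFFSET_BITS = 25;
  static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
  static final int COLLISION_MASK = 0x40;
  static final int SUGGESTIBLE_MASK = 0x20;
  static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;

  static int pack(int wordDataOffset, int length, boolean suggestible, boolean collision) {
    int mask =
        (collision ? COLLISION_MASK : 0)
            | (suggestible ? SUGGESTIBLE_MASK : 0)
            | Math.min(length, MAX_STORED_LENGTH);
    return (mask << OFFSET_BITS) | wordDataOffset;
  }

  static boolean mightBeSuggestible(int slot) {
    return ((slot >>> OFFSET_BITS) & SUGGESTIBLE_MASK) != 0;
  }

  static int storedLength(int slot) {
    return (slot >>> OFFSET_BITS) & MAX_STORED_LENGTH; // capped, so only a pre-filter
  }

  static int offset(int slot) {
    return slot & OFFSET_MASK;
  }

  public static void main(String[] args) {
    int slot = pack(123_456, 7, true, false);
    System.out.println(offset(slot) + " " + storedLength(slot) + " " + mightBeSuggestible(slot));
    // prints: 123456 7 true
  }
}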
@@ -77,8 +79,8 @@ class WordStorage {
    *       single-character entries
    *   <li>(Optional, for hash-colliding entries only)
    *       <ul>
-   *         <li>BYTE: COLLISION+LENGTH info (see {@link #hashTable}) for the previous entry with
-   *             the same hash
+   *         <li>BYTE: COLLISION+SUGGESTIBLE+LENGTH info (see {@link #hashTable}) for the previous
+   *             entry with the same hash
    *         <li>VINT: (delta) pointer to the previous entry
    *       </ul>
    *   <li>(Optional, for non-leaf entries only) VINT+: word form data, returned from {@link
    *       #lookupWord}, preceded by its length
@@ -140,12 +142,18 @@ private static boolean hasCollision(int mask) {
     return (mask & COLLISION_MASK) != 0;
   }
 
+  private static boolean hasSuggestibleEntries(int mask) {
+    return (mask & SUGGESTIBLE_MASK) != 0;
+  }
+
   /**
    * Calls the processor for every dictionary entry with length between minLength and maxLength,
-   * both ends inclusive. Note that the callback arguments (word and forms) are reused, so they can
-   * be modified in any way, but may not be saved for later by the processor
+   * both ends inclusive, and at least one suggestible alternative (without NOSUGGEST, FORBIDDENWORD
+   * or ONLYINCOMPOUND flags). Note that the callback arguments (word and forms) are reused, so they
+   * can be modified in any way, but may not be saved for later by the processor
    */
-  void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
+  void processSuggestibleWords(
+      int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
     assert minLength <= maxLength;
     CharsRef chars = new CharsRef(maxLength);
     IntsRef forms = new IntsRef();
@@ -162,7 +170,8 @@ void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef>
       int prevPos = pos - in.readVInt();
 
       boolean last = !hasCollision(mask);
-      boolean mightMatch = hasLengthInRange(mask, minLength, maxLength);
+      boolean mightMatch =
+          hasSuggestibleEntries(mask) && hasLengthInRange(mask, minLength, maxLength);
 
       if (!last) {
         mask = in.readByte();
@@ -235,6 +244,7 @@ static class Builder {
     private final boolean hasCustomMorphData;
     private final int[] hashTable;
     private byte[] wordData;
+    private final char[] noSuggestFlags;
     private final int[] chainLengths;
 
     private final IntsRefBuilder currentOrds = new IntsRefBuilder();
@@ -253,10 +263,15 @@ static class Builder {
      *     pre-size the hash table. This argument can be a bit larger than the actual word count,
      *     but not smaller.
      */
-    Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) {
+    Builder(
+        int wordCount,
+        boolean hasCustomMorphData,
+        FlagEnumerator flagEnumerator,
+        char[] noSuggestFlags) {
       this.wordCount = wordCount;
       this.flagEnumerator = flagEnumerator;
       this.hasCustomMorphData = hasCustomMorphData;
+      this.noSuggestFlags = noSuggestFlags;
 
       hashTable = new int[wordCount];
       wordData = new byte[wordCount * 6];
@@ -317,7 +332,7 @@ private int flushGroup() throws IOException {
       currentOrds.clear();
       boolean hasNonHidden = false;
       for (char[] flags : group) {
-        if (!hasHiddenFlag(flags)) {
+        if (!hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
           hasNonHidden = true;
           break;
         }
@@ -325,7 +340,7 @@ private int flushGroup() throws IOException {
 
       for (int i = 0; i < group.size(); i++) {
         char[] flags = group.get(i);
-        if (hasNonHidden && hasHiddenFlag(flags)) {
+        if (hasNonHidden && hasFlag(flags, Dictionary.HIDDEN_FLAG)) {
           continue;
         }
 
@@ -353,7 +368,9 @@ private int flushGroup() throws IOException {
 
       int prevCode = hashTable[hash];
       int mask =
-          (prevCode == 0 ? 0 : COLLISION_MASK) | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
+          (prevCode == 0 ? 0 : COLLISION_MASK)
+              | (group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) ? SUGGESTIBLE_MASK : 0)
+              | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
       hashTable[hash] = (mask << OFFSET_BITS) | pos;
 
       if (++chainLengths[hash] > 20) {
@@ -375,9 +392,16 @@ private int flushGroup() throws IOException {
       return pos;
     }
 
-    private static boolean hasHiddenFlag(char[] flags) {
+    private boolean hasNoSuggestFlag(char[] flags) {
       for (char flag : flags) {
-        if (flag == Dictionary.HIDDEN_FLAG) {
+        if (hasFlag(noSuggestFlags, flag)) return true;
+      }
+      return false;
+    }
+
+    private static boolean hasFlag(char[] flags, char flag) {
+      for (char f : flags) {
+        if (f == flag) {
           return true;
         }
       }
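The Builder side of the change, in isolation: flushGroup() sets the SUGGESTIBLE bit when at least one homonym in the group carries none of the dictionary's non-suggestible flags. A standalone sketch with invented names, using the flag letters from the new suggestible.aff further down; this is also the rule behind the expectations in the new test below (only "normal" and "ambiguous" remain suggestible):

import java.util.Arrays;
import java.util.List;

class SuggestibleBitSketch {
  // Sorted non-suggestible flags, as Dictionary.allNonSuggestibleFlags() would return them for the
  // test suggestible.aff (F=FORBIDDENWORD, N=NOSUGGEST, O=ONLYINCOMPOUND, S=SUBSTANDARD).
  static final char[] NO_SUGGEST_FLAGS = {'F', 'N', 'O', 'S'};

  // Mirrors group.stream().anyMatch(flags -> !hasNoSuggestFlag(flags)) in flushGroup():
  // one clean homonym is enough for the whole entry to stay visible to suggestions.
  static boolean suggestible(List<char[]> homonymFlags) {
    return homonymFlags.stream().anyMatch(flags -> !hasNoSuggestFlag(flags));
  }

  static boolean hasNoSuggestFlag(char[] flags) {
    for (char flag : flags) {
      if (Arrays.binarySearch(NO_SUGGEST_FLAGS, flag) >= 0) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    System.out.println(suggestible(List.of("".toCharArray()))); // like "normal": true
    System.out.println(suggestible(List.of("N".toCharArray()))); // like "nosuggest/N": false
    System.out.println(suggestible(List.of("".toCharArray(), "N".toCharArray()))); // like "ambiguous": true
  }
}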
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index b18b2a4015e6..7b704d2e31c4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -86,11 +86,16 @@ public void testProcessAllWords() throws Exception {
     }
   }
 
+  public void testProcessSuggestibleWords() throws Exception {
+    Dictionary dictionary = loadDictionary("suggestible.aff", "suggestible.dic");
+
+    Set<String> processed = processSuggestibleWords(dictionary, 1, 100);
+    assertEquals(Set.of("normal", "ambiguous"), processed);
+  }
+
   private void checkProcessWords(
       Dictionary dictionary, Set<String> allWords, int minLength, int maxLength) {
-    Set<String> processed = new HashSet<>();
-    dictionary.words.processAllWords(
-        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+    Set<String> processed = processSuggestibleWords(dictionary, minLength, maxLength);
 
     Set<String> filtered =
         allWords.stream()
@@ -100,6 +105,14 @@ private void checkProcessWords(
     assertEquals("For lengths [" + minLength + "," + maxLength + "]", filtered, processed);
   }
 
+  private static Set<String> processSuggestibleWords(
+      Dictionary dictionary, int minLength, int maxLength) {
+    Set<String> processed = new HashSet<>();
+    dictionary.words.processSuggestibleWords(
+        minLength, maxLength, (word, __) -> processed.add(word.toString()));
+    return processed;
+  }
+
   public void testCompressedDictionary() throws Exception {
     Dictionary dictionary = loadDictionary("compressed.aff", "compressed.dic");
     assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.aff
new file mode 100644
index 000000000000..c79bafcede5f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.aff
@@ -0,0 +1,4 @@
+ONLYINCOMPOUND O
+NOSUGGEST N
+FORBIDDENWORD F
+SUBSTANDARD S
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.dic
new file mode 100644
index 000000000000..7c899f7a6787
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/suggestible.dic
@@ -0,0 +1,8 @@
+1
+normal
+compound/O
+forbidden/F
+nosuggest/N
+substandard/S
+ambiguous
+ambiguous/N
\ No newline at end of file
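For context, the code path this patch speeds up sits behind Hunspell.suggest(). A minimal usage sketch, assuming the Lucene 9.x Hunspell and Dictionary APIs and hypothetical en.aff/en.dic files in the working directory:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Hunspell;
import org.apache.lucene.store.ByteBuffersDirectory;

public class SuggestDemo {
  public static void main(String[] args) throws Exception {
    try (InputStream affix = Files.newInputStream(Path.of("en.aff"));
        InputStream words = Files.newInputStream(Path.of("en.dic"))) {
      // The temp Directory is used while sorting and serializing the .dic entries at load time.
      Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "hunspell", affix, words);
      Hunspell speller = new Hunspell(dictionary);
      if (!speller.spell("exampel")) {
        List<String> suggestions = speller.suggest("exampel"); // exercises GeneratingSuggester
        System.out.println(suggestions);
      }
    }
  }
}

Only misspelled words reach GeneratingSuggester, and with this patch its root enumeration no longer deserializes dictionary entries that could never be suggested anyway.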