From 292a1ba07a4c42893acedc85c0755fe0e3167a76 Mon Sep 17 00:00:00 2001 From: Josh Edwards Date: Mon, 11 Apr 2016 12:19:28 -0400 Subject: [PATCH] SOLR-8010 Adding the ability to optionally enable breaking words where one side is a word but the other is not. Integrated with re-based trunk to re-submit the pull request. --- .../search/spell/WordBreakSpellChecker.java | 184 ++++++++++++++++-- .../spelling/WordBreakSolrSpellChecker.java | 11 ++ .../WordBreakSolrSpellCheckerTest.java | 129 +++++++++++- 3 files changed, 303 insertions(+), 21 deletions(-) diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java index 191f431bff48..6d466daf6d71 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java @@ -38,6 +38,7 @@ public class WordBreakSpellChecker { private int maxCombineWordLength = 20; private int maxChanges = 1; private int maxEvaluations = 1000; + private boolean isWordRequiredOnBothSidesOfBreak = true; /** Term that can be used to prohibit adjacent terms from being combined */ public static final Term SEPARATOR_TERM = new Term("", ""); @@ -103,8 +104,19 @@ public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions, } int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions; - Comparator queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? 
new LengthThenMaxFreqComparator() - : new LengthThenSumFreqComparator(); + + Comparator queueComparator; + if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) { + if (isWordRequiredOnBothSidesOfBreak) + queueComparator = new LengthThenMaxFreqComparator(); + else + queueComparator = new MostRealWordUseThenLengthAndMaxFreqComparator(); + } else { + if (isWordRequiredOnBothSidesOfBreak) + queueComparator = new LengthThenSumFreqComparator(); + else + queueComparator = new MostRealWordUseThenLengthAndSumFreqComparator(); + } Queue suggestions = new PriorityQueue<>( queueInitialCapacity, queueComparator); @@ -119,7 +131,7 @@ public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions, } generateBreakUpSuggestions(term, ir, 1, maxSuggestions, - useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0, + useMinSuggestionFrequency, new SuggestWord[0], new SuggestWord[0], suggestions, 0, sortMethod); SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][]; @@ -256,7 +268,7 @@ public CombineSuggestion[] suggestWordCombinations(Term[] terms, private int generateBreakUpSuggestions(Term term, IndexReader ir, int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency, - SuggestWord[] prefix, Queue suggestions, + SuggestWord[] prefix, SuggestWord[] suffix, Queue suggestions, int totalEvaluations, BreakSuggestionSortMethod sortMethod) throws IOException { String termText = term.text(); @@ -276,23 +288,77 @@ private int generateBreakUpSuggestions(Term term, IndexReader ir, String rightText = termText.substring(end); SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText); - if (leftWord.freq >= useMinSuggestionFrequency) { + if (leftWord.freq >= useMinSuggestionFrequency || !isWordRequiredOnBothSidesOfBreak) { SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText); - if (rightWord.freq >= useMinSuggestionFrequency) { + if (rightWord.freq >= useMinSuggestionFrequency || + 
(!isWordRequiredOnBothSidesOfBreak && leftWord.freq + rightWord.freq >= useMinSuggestionFrequency)) { + //This second if check is to make sure that there aren't consecutive non-words. It should never fail + //while isWordRequiredOnBothSidesOfBreak is set to true. + if ((leftWord.freq > 0 || prefix.length == 0 || prefix[prefix.length - 1].freq > 0) + && (rightWord.freq > 0 || suffix.length == 0 || suffix[0].freq > 0)) { + SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper( - newSuggestion(prefix, leftWord, rightWord)); + newSuggestion(prefix, leftWord, rightWord, suffix)); + //Duplicates are possible due to how we recurse if a "word" is not required on both sides, so if either both sides + //have to be a word, or there are no duplicates, then add it. We check first to see if a "word" is required on both + //sides so that we don't have to check the contains method, for efficiency. + if (isWordRequiredOnBothSidesOfBreak || !suggestions.contains(suggestion)) { suggestions.offer(suggestion); if (suggestions.size() > maxSuggestions) { suggestions.poll(); } + } + } } int newNumberBreaks = numberBreaks + 1; if (newNumberBreaks <= maxChanges) { - int evaluations = generateBreakUpSuggestions(new Term(term.field(), + SuggestWord[] newPrefix = newPrefix(prefix, leftWord); + SuggestWord[] newSuffix = newSuffix(rightWord, suffix); + + int consecutivePrefixesWithNoFrequency = 0; + for (SuggestWord word : newPrefix) { + if (word.freq == 0) + consecutivePrefixesWithNoFrequency++; + else + consecutivePrefixesWithNoFrequency = 0; + + if (consecutivePrefixesWithNoFrequency == 2) + break; + } + + int consecutiveSuffixesWithNoFrequency = 0; + for (SuggestWord word : newSuffix) { + if (word.freq == 0) + consecutiveSuffixesWithNoFrequency++; + else + consecutiveSuffixesWithNoFrequency = 0; + + if (consecutiveSuffixesWithNoFrequency == 2) + break; + } + + int evaluationsRight = 0; + int evaluationsLeft = 0; + + //If there are multiple non-words in a row, it is not a valid 
suggestion, so go no further. + if (consecutivePrefixesWithNoFrequency < 2 && consecutiveSuffixesWithNoFrequency < 2) { + evaluationsRight = generateBreakUpSuggestions(new Term(term.field(), rightWord.string), ir, newNumberBreaks, maxSuggestions, - useMinSuggestionFrequency, newPrefix(prefix, leftWord), + useMinSuggestionFrequency, newPrefix, suffix, + suggestions, totalEvaluations, sortMethod); + //If every break has to be a word, then you don't need to re-process both sides, but working from left to right + //is sufficient, so this step can be skipped for efficiency. + //In fact, if you leave this on, then you may get false positives, as we have not verified that rightWord.freq > 0. + if (!isWordRequiredOnBothSidesOfBreak) { + evaluationsLeft = generateBreakUpSuggestions(new Term(term.field(), + leftWord.string), ir, newNumberBreaks, maxSuggestions, + useMinSuggestionFrequency, prefix, newSuffix, suggestions, totalEvaluations, sortMethod); - totalEvaluations += evaluations; + } + } + + totalEvaluations += evaluationsRight + evaluationsLeft; + } } @@ -312,10 +378,17 @@ private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) { return newPrefix; } + private SuggestWord[] newSuffix(SuggestWord append, SuggestWord[] oldSuffix) { + SuggestWord[] newSuffix = new SuggestWord[oldSuffix.length + 1]; + System.arraycopy(oldSuffix, 0, newSuffix, 1, oldSuffix.length); + newSuffix[0] = append; + return newSuffix; + } + private SuggestWord[] newSuggestion(SuggestWord[] prefix, - SuggestWord append1, SuggestWord append2) { - SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2]; - int score = prefix.length + 1; + SuggestWord append1, SuggestWord append2, SuggestWord[] suffix) { + SuggestWord[] newSuggestion = new SuggestWord[prefix.length + suffix.length + 2]; + int score = prefix.length + suffix.length + 1; for (int i = 0; i < prefix.length; i++) { SuggestWord word = new SuggestWord(); word.string = prefix[i].string; @@ -325,8 +398,15 @@ private 
SuggestWord[] newSuggestion(SuggestWord[] prefix, } append1.score = score; append2.score = score; - newSuggestion[newSuggestion.length - 2] = append1; - newSuggestion[newSuggestion.length - 1] = append2; + newSuggestion[prefix.length] = append1; + newSuggestion[prefix.length + 1] = append2; + for (int i = 0; i < suffix.length; i++) { + SuggestWord word = new SuggestWord(); + word.string = suffix[i].string; + word.freq = suffix[i].freq; + word.score = score; + newSuggestion[i + prefix.length + 2] = word; + } return newSuggestion; } @@ -381,6 +461,14 @@ public int getMaxEvaluations() { return maxEvaluations; } + /** + * Returns whether or not a word is required on both sides of the suggested break. + * @see #setIsWordRequiredOnBothSidesOfBreak(boolean) + */ + public boolean getIsWordRequiredOnBothSidesOfBreak() { + return isWordRequiredOnBothSidesOfBreak; + } + /** *
<p>
* The minimum frequency a term must have to be included as part of a @@ -442,6 +530,51 @@ public void setMaxEvaluations(int maxEvaluations) { this.maxEvaluations = maxEvaluations; } + /** + *
<p>
+ * Whether or not both sides of the suggested break have to be "words." Defaults to true. + *
</p>
+ * + * @see #getIsWordRequiredOnBothSidesOfBreak() + */ + public void setIsWordRequiredOnBothSidesOfBreak(boolean isWordRequiredOnBothSidesOfBreak) { + this.isWordRequiredOnBothSidesOfBreak = isWordRequiredOnBothSidesOfBreak; + } + + private class MostRealWordUseThenLengthAndMaxFreqComparator implements + Comparator { + @Override + public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) { + if (o1.realWordCharacterCount != o2.realWordCharacterCount) + return o1.realWordCharacterCount - o2.realWordCharacterCount; + + if (o1.suggestWords.length != o2.suggestWords.length) { + return o2.suggestWords.length - o1.suggestWords.length; + } + if (o1.freqMax != o2.freqMax) { + return o1.freqMax - o2.freqMax; + } + return 0; + } + } + + private class MostRealWordUseThenLengthAndSumFreqComparator implements + Comparator { + @Override + public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) { + if (o1.realWordCharacterCount != o2.realWordCharacterCount) + return o1.realWordCharacterCount - o2.realWordCharacterCount; + + if (o1.suggestWords.length != o2.suggestWords.length) { + return o2.suggestWords.length - o1.suggestWords.length; + } + if (o1.freqSum != o2.freqSum) { + return o1.freqSum - o2.freqSum; + } + return 0; + } +} + private class LengthThenMaxFreqComparator implements Comparator { @Override @@ -489,17 +622,38 @@ private class SuggestWordArrayWrapper { final SuggestWord[] suggestWords; final int freqMax; final int freqSum; + final int realWordCharacterCount; SuggestWordArrayWrapper(SuggestWord[] suggestWords) { this.suggestWords = suggestWords; int aFreqSum = 0; int aFreqMax = 0; + int aRealWordCharacterCount = 0; for (SuggestWord sw : suggestWords) { aFreqSum += sw.freq; aFreqMax = Math.max(aFreqMax, sw.freq); + if (sw.freq > 0) + aRealWordCharacterCount += sw.string.length(); } this.freqSum = aFreqSum; this.freqMax = aFreqMax; + this.realWordCharacterCount = aRealWordCharacterCount; + } + + public boolean equals(Object obj) { 
+ if (obj == null || !obj.getClass().equals(this.getClass())) + return false; + + SuggestWordArrayWrapper other = (SuggestWordArrayWrapper)obj; + if (this.suggestWords == null || other.suggestWords == null || this.suggestWords.length != other.suggestWords.length) + return false; + + for (int i=0; i * Specify a value on the "breakSugestionTieBreaker" parameter. @@ -158,6 +163,12 @@ public String init(@SuppressWarnings("unchecked") NamedList config, if (msf > 0) { wbsp.setMinSuggestionFrequency(msf); } + boolean br = boolParam(config, PARAM_IS_WORD_REQUIRED_ON_BOTH_SIDES_OF_BREAK); + //default to false if not populated, instead of true. + String strParam = strParam(config, PARAM_IS_WORD_REQUIRED_ON_BOTH_SIDES_OF_BREAK); + if (strParam != null) { + wbsp.setIsWordRequiredOnBothSidesOfBreak(br); + } return name; } diff --git a/solr/core/src/test/org/apache/solr/spelling/WordBreakSolrSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/WordBreakSolrSpellCheckerTest.java index 92e06bb2403f..63e0388372f1 100644 --- a/solr/core/src/test/org/apache/solr/spelling/WordBreakSolrSpellCheckerTest.java +++ b/solr/core/src/test/org/apache/solr/spelling/WordBreakSolrSpellCheckerTest.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.Token; +import org.apache.lucene.search.spell.WordBreakSpellChecker; import org.apache.lucene.util.LuceneTestCase.SuppressTempFileChecks; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.util.NamedList; @@ -129,12 +130,6 @@ public void testStandAlone() throws Exception { assertTrue(orig.length()==4); assertTrue(corr.length==1); assertTrue(corr[0].equals("pi ne")); - } else if(orig.toString().equals("pine")) { - assertTrue(orig.startOffset()==10); - assertTrue(orig.endOffset()==14); - assertTrue(orig.length()==4); - assertTrue(corr.length==1); - assertTrue(corr[0].equals("pi ne")); } else if(orig.toString().equals("apple")) { assertTrue(orig.startOffset()==15); 
assertTrue(orig.endOffset()==20); @@ -156,6 +151,128 @@ public void testStandAlone() throws Exception { } } @Test + public void testStandAloneBreakWhenNotBothWords() throws Exception { + SolrCore core = h.getCore(); + WordBreakSolrSpellChecker checker = new WordBreakSolrSpellChecker(); + NamedList params = new NamedList<>(); + params.add("field", "lowerfilt"); + params.add(WordBreakSolrSpellChecker.PARAM_BREAK_WORDS, "true"); + params.add(WordBreakSolrSpellChecker.PARAM_COMBINE_WORDS, "true"); + params.add(WordBreakSolrSpellChecker.PARAM_MAX_CHANGES, "10"); + params.add(WordBreakSolrSpellChecker.PARAM_IS_WORD_REQUIRED_ON_BOTH_SIDES_OF_BREAK, "false"); + checker.init(params, core); + + RefCounted searcher = core.getSearcher(); + QueryConverter qc = new SpellingQueryConverter(); + qc.setAnalyzer(new MockAnalyzer(random())); + Collection tokens = qc.convert("paintable pine apple good ness grampa miss grampabill mypaintablefoods"); + SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.get().getIndexReader(), 10); + SpellingResult result = checker.getSuggestions(spellOpts); + searcher.decref(); + + assertTrue(result != null && result.getSuggestions() != null); + assertTrue(result.getSuggestions().size()==13); + + for(Map.Entry> s : result.getSuggestions().entrySet()) { + Token orig = s.getKey(); + String[] corr = s.getValue().keySet().toArray(new String[0]); + if(orig.toString().equals("paintable")) { + assertTrue(orig.startOffset()==0); + assertTrue(orig.endOffset()==9); + assertTrue(orig.length()==9); + assertTrue(corr.length==7); + assertTrue(corr[0].equals("paint able")); //everything in "real" words; 1 break ; max doc freq=5 + assertTrue(corr[1].equals("pain table")); //everything in "real" words; 1 break ; max doc freq=2 + assertTrue(corr[2].equals("pa in table")); //everything in "real" words; 2 breaks + assertTrue(corr[3].equals("pain t able")); //8 out of 9 in "real" words; 2 breaks + assertTrue(corr[4].equals("pa in t able"));//8 out of 9 in 
"real" words; 3 breaks + assertTrue(corr[5].equals("pa int able")); //6 out of 9 in "real" words; 2 breaks + assertTrue(corr[6].equals("pa intable")); //2 out of 9 in "real" words; 1 break + } else if(orig.toString().equals("pine apple")) { + assertTrue(orig.startOffset()==10); + assertTrue(orig.endOffset()==20); + assertTrue(orig.length()==10); + assertTrue(corr.length==1); + assertTrue(corr[0].equals("pineapple")); + } else if(orig.toString().equals("paintable pine")) { + assertTrue(orig.startOffset()==0); + assertTrue(orig.endOffset()==14); + assertTrue(orig.length()==14); + assertTrue(corr.length==1); + assertTrue(corr[0].equals("paintablepine")); + } else if(orig.toString().equals("good ness")) { + assertTrue(orig.startOffset()==21); + assertTrue(orig.endOffset()==30); + assertTrue(orig.length()==9); + assertTrue(corr.length==1); + assertTrue(corr[0].equals("goodness")); + } else if(orig.toString().equals("pine apple good ness")) { + assertTrue(orig.startOffset()==10); + assertTrue(orig.endOffset()==30); + assertTrue(orig.length()==20); + assertTrue(corr.length==1); + assertTrue(corr[0].equals("pineapplegoodness")); + } else if(orig.toString().equals("pine")) { + assertTrue(orig.startOffset()==10); + assertTrue(orig.endOffset()==14); + assertTrue(orig.length()==4); + assertTrue(corr.length==2); + assertTrue(corr[0].equals("pi ne")); + assertTrue(corr[1].equals("p in e")); + } else if(orig.toString().equals("ness")) { + assertTrue(orig.startOffset()==26); + assertTrue(orig.endOffset()==30); + assertTrue(orig.length()==4); + assertTrue(corr.length==1); + assertTrue(corr[0].equals("ne ss")); + } else if(orig.toString().equals("grampa")) { + assertTrue(orig.startOffset()==31); + assertTrue(orig.endOffset()==37); + assertTrue(orig.length()==6); + assertTrue(corr.length==1); + assertTrue(corr[0].equals("gram pa")); + } else if(orig.toString().equals("miss")) { + assertTrue(orig.startOffset()==38); + assertTrue(orig.endOffset()==42); + assertTrue(orig.length()==4); + 
assertTrue(corr.length==0); + } else if(orig.toString().equals("apple")) { + assertTrue(orig.startOffset()==15); + assertTrue(orig.endOffset()==20); + assertTrue(orig.length()==5); + assertTrue(corr.length==0); + } else if(orig.toString().equals("good")) { + assertTrue(orig.startOffset()==21); + assertTrue(orig.endOffset()==25); + assertTrue(orig.length()==4); + assertTrue(corr.length==0); + } else if(orig.toString().equals("grampabill")) { + assertTrue(orig.startOffset()==43); + assertTrue(orig.endOffset()==53); + assertTrue(orig.length()==10); + assertTrue(corr.length==1); + assertTrue(corr[0].equals("gram pa bill")); + } else if(orig.toString().equals("mypaintablefoods")) { + assertTrue(orig.startOffset()==54); + assertTrue(orig.endOffset()==70); + assertTrue(orig.length()==16); + assertTrue(corr.length==10); //it maxes out on suggestions + assertTrue(corr[0].equals("my paint able food s")); //13 out of 16 in "real" words; 4 breaks; 5 max frequency + assertTrue(corr[1].equals("my pain table food s")); //13 out of 16 in "real" words; 4 breaks; 2 max frequency + assertTrue(corr[2].equals("my pa in table food s")); //13 out of 16 in "real" words; 5 breaks; 7 max frequency + assertTrue(corr[3].equals("my pain t able food s")); //12 out of 16 in "real" words; 5 breaks; 5 max frequency + assertTrue(corr[4].equals("my pa in t able food s")); //12 out of 16 in "real" words; 6 breaks; 7 max frequency + assertTrue(corr[5].equals("mypa in table food s")); //11 out of 16 in "real" words; 4 breaks; 7 max frequency + assertTrue(corr[6].equals("mypa in t able food s")); //10 out of 16 in "real" words; 5 breaks; 7 max frequency + assertTrue(corr[7].equals("my pa int able food s")); //10 out of 16 in "real" words; 5 breaks; 5 max frequency + assertTrue(corr[8].equals("my paint able foods")); //9 out of 16 in "real" words; 3 breaks; 5 max frequency + assertTrue(corr[9].equals("mypain table food s")); //9 out of 16 in "real" words; 3 breaks; 2 max frequency + }else { + 
fail("Unexpected original result: " + orig); + } + } + } + @Test public void testInConjunction() throws Exception { assertQ(req( "q", "lowerfilt:(paintable pine apple good ness)",