From 871c6d24bca90e32a3c5dc3de54dd48d6229ffc7 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Mon, 7 Nov 2016 15:36:41 -0500 Subject: [PATCH 1/2] LUCENE-7544 - add UnifiedHighlighter extension points for custom queries --- .../uhighlight/AnalysisOffsetStrategy.java | 15 +- .../uhighlight/MultiTermHighlighting.java | 236 ++++++++++-------- .../search/uhighlight/PhraseHelper.java | 19 +- .../search/uhighlight/UnifiedHighlighter.java | 35 ++- .../uhighlight/TestUnifiedHighlighter.java | 82 ++++++ .../uhighlight/TestUnifiedHighlighterMTQ.java | 90 +++++++ 6 files changed, 357 insertions(+), 120 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java index 553a636ed6af..6b4cc74a48e2 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java @@ -19,8 +19,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.function.Function; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.FilteringTokenFilter; @@ -30,6 +32,7 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Terms; import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automata; @@ -50,7 +53,9 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { private final LeafReader leafReader; private final CharacterRunAutomaton preMemIndexFilterAutomaton; - public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper, 
CharacterRunAutomaton[] automata, Analyzer analyzer) { + public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper, + CharacterRunAutomaton[] automata, Analyzer analyzer, + Function<Query, Collection<Query>> multiTermQueryRewrite) { super(field, extractedTerms, phraseHelper, automata); this.analyzer = analyzer; // Automata (Wildcards / MultiTermQuery): @@ -68,7 +73,8 @@ public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHel memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // preFilter for MemoryIndex - preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases); + preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases, + multiTermQueryRewrite); } else { memoryIndex = null; leafReader = null; @@ -155,7 +161,8 @@ protected boolean accept() throws IOException { */ private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms, CharacterRunAutomaton[] automata, - PhraseHelper strictPhrases) { + PhraseHelper strictPhrases, + Function<Query, Collection<Query>> multiTermQueryRewrite) { List allAutomata = new ArrayList<>(); if (terms.length > 0) { allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms)))); @@ -163,7 +170,7 @@ private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesR Collections.addAll(allAutomata, automata); for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) { Collections.addAll(allAutomata, - MultiTermHighlighting.extractAutomata(spanQuery, field, true));//true==lookInSpan + MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan } if (allAutomata.size() == 1) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java index 9498af584a85..8bf55c47ad43 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java @@ -20,8 +20,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Comparator; import java.util.List; +import java.util.function.Function; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -69,130 +71,142 @@ private MultiTermHighlighting() { * Extracts all MultiTermQueries for {@code field}, and returns equivalent * automata that will match terms. */ - public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan) { + public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan, + Function<Query, Collection<Query>> customAutomataExtraction) { List list = new ArrayList<>(); - if (query instanceof BooleanQuery) { - for (BooleanClause clause : (BooleanQuery) query) { - if (!clause.isProhibited()) { - list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan))); - } - } - } else if (query instanceof ConstantScoreQuery) { - list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan))); - } else if (query instanceof DisjunctionMaxQuery) { - for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); - } - } else if (lookInSpan && query instanceof SpanOrQuery) { - for (Query sub : ((SpanOrQuery) query).getClauses()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); - } - } else if (lookInSpan && query instanceof SpanNearQuery) { - for (Query sub : ((SpanNearQuery) query).getClauses()) { -
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); + Collection customSubQueries = customAutomataExtraction.apply(query); + if (customSubQueries != null) { + for (Query sub : customSubQueries) { + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, customAutomataExtraction))); } - } else if (lookInSpan && query instanceof SpanNotQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan))); - } else if (lookInSpan && query instanceof SpanPositionCheckQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan))); - } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) { - list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field, lookInSpan))); - } else if (query instanceof AutomatonQuery) { - final AutomatonQuery aq = (AutomatonQuery) query; - if (aq.getField().equals(field)) { - list.add(new CharacterRunAutomaton(aq.getAutomaton()) { - @Override - public String toString() { - return aq.toString(); - } - }); - } - } else if (query instanceof PrefixQuery) { - final PrefixQuery pq = (PrefixQuery) query; - Term prefix = pq.getPrefix(); - if (prefix.field().equals(field)) { - list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()), - Automata.makeAnyString())) { - @Override - public String toString() { - return pq.toString(); + } else { + if (query instanceof BooleanQuery) { + for (BooleanClause clause : (BooleanQuery) query) { + if (!clause.isProhibited()) { + list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan, customAutomataExtraction))); } - }); - } - } else if (query instanceof FuzzyQuery) { - final FuzzyQuery fq = (FuzzyQuery) query; - if (fq.getField().equals(field)) { - String utf16 = fq.getTerm().text(); - int termText[] = new int[utf16.codePointCount(0, utf16.length())]; - for (int cp, i = 0, j = 0; i 
< utf16.length(); i += Character.charCount(cp)) { - termText[j++] = cp = utf16.codePointAt(i); } - int termLength = termText.length; - int prefixLength = Math.min(fq.getPrefixLength(), termLength); - String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength); - LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions()); - String prefix = UnicodeUtil.newString(termText, 0, prefixLength); - Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix); - list.add(new CharacterRunAutomaton(automaton) { - @Override - public String toString() { - return fq.toString(); - } - }); - } - } else if (query instanceof TermRangeQuery) { - final TermRangeQuery tq = (TermRangeQuery) query; - if (tq.getField().equals(field)) { - final CharsRef lowerBound; - if (tq.getLowerTerm() == null) { - lowerBound = null; - } else { - lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString()); + } else if (query instanceof ConstantScoreQuery) { + list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan, + customAutomataExtraction))); + } else if (query instanceof DisjunctionMaxQuery) { + for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) { + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, customAutomataExtraction))); } - - final CharsRef upperBound; - if (tq.getUpperTerm() == null) { - upperBound = null; - } else { - upperBound = new CharsRef(tq.getUpperTerm().utf8ToString()); + } else if (lookInSpan && query instanceof SpanOrQuery) { + for (Query sub : ((SpanOrQuery) query).getClauses()) { + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, customAutomataExtraction))); } + } else if (lookInSpan && query instanceof SpanNearQuery) { + for (Query sub : ((SpanNearQuery) query).getClauses()) { + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, customAutomataExtraction))); + } + } else if (lookInSpan && query instanceof 
SpanNotQuery) { + list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan, + customAutomataExtraction))); + } else if (lookInSpan && query instanceof SpanPositionCheckQuery) { + list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan, + customAutomataExtraction))); + } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) { + list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field, + lookInSpan, customAutomataExtraction))); + } else if (query instanceof AutomatonQuery) { + final AutomatonQuery aq = (AutomatonQuery) query; + if (aq.getField().equals(field)) { + list.add(new CharacterRunAutomaton(aq.getAutomaton()) { + @Override + public String toString() { + return aq.toString(); + } + }); + } + } else if (query instanceof PrefixQuery) { + final PrefixQuery pq = (PrefixQuery) query; + Term prefix = pq.getPrefix(); + if (prefix.field().equals(field)) { + list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()), + Automata.makeAnyString())) { + @Override + public String toString() { + return pq.toString(); + } + }); + } + } else if (query instanceof FuzzyQuery) { + final FuzzyQuery fq = (FuzzyQuery) query; + if (fq.getField().equals(field)) { + String utf16 = fq.getTerm().text(); + int termText[] = new int[utf16.codePointCount(0, utf16.length())]; + for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) { + termText[j++] = cp = utf16.codePointAt(i); + } + int termLength = termText.length; + int prefixLength = Math.min(fq.getPrefixLength(), termLength); + String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength); + LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions()); + String prefix = UnicodeUtil.newString(termText, 0, prefixLength); + Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), 
prefix); + list.add(new CharacterRunAutomaton(automaton) { + @Override + public String toString() { + return fq.toString(); + } + }); + } + } else if (query instanceof TermRangeQuery) { + final TermRangeQuery tq = (TermRangeQuery) query; + if (tq.getField().equals(field)) { + final CharsRef lowerBound; + if (tq.getLowerTerm() == null) { + lowerBound = null; + } else { + lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString()); + } - final boolean includeLower = tq.includesLower(); - final boolean includeUpper = tq.includesUpper(); - final CharsRef scratch = new CharsRef(); - - @SuppressWarnings("deprecation") - final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); + final CharsRef upperBound; + if (tq.getUpperTerm() == null) { + upperBound = null; + } else { + upperBound = new CharsRef(tq.getUpperTerm().utf8ToString()); + } - // this is *not* an automaton, but its very simple - list.add(new CharacterRunAutomaton(Automata.makeEmpty()) { - @Override - public boolean run(char[] s, int offset, int length) { - scratch.chars = s; - scratch.offset = offset; - scratch.length = length; - - if (lowerBound != null) { - int cmp = comparator.compare(scratch, lowerBound); - if (cmp < 0 || (!includeLower && cmp == 0)) { - return false; + final boolean includeLower = tq.includesLower(); + final boolean includeUpper = tq.includesUpper(); + final CharsRef scratch = new CharsRef(); + + @SuppressWarnings("deprecation") + final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); + + // this is *not* an automaton, but its very simple + list.add(new CharacterRunAutomaton(Automata.makeEmpty()) { + @Override + public boolean run(char[] s, int offset, int length) { + scratch.chars = s; + scratch.offset = offset; + scratch.length = length; + + if (lowerBound != null) { + int cmp = comparator.compare(scratch, lowerBound); + if (cmp < 0 || (!includeLower && cmp == 0)) { + return false; + } } - } - if (upperBound != null) { - int cmp = 
comparator.compare(scratch, upperBound); - if (cmp > 0 || (!includeUpper && cmp == 0)) { - return false; + if (upperBound != null) { + int cmp = comparator.compare(scratch, upperBound); + if (cmp > 0 || (!includeUpper && cmp == 0)) { + return false; + } } + return true; } - return true; - } - @Override - public String toString() { - return tq.toString(); - } - }); + @Override + public String toString() { + return tq.toString(); + } + }); + } } } return list.toArray(new CharacterRunAutomaton[list.size()]); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java index 5225041f9bec..95d51c917da4 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java @@ -40,7 +40,7 @@ public class PhraseHelper { public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_", - spanQuery -> null, true); + spanQuery -> null, query -> null, true); //TODO it seems this ought to be a general thing on Spans? private static final Comparator SPANS_COMPARATOR = (o1, o2) -> { @@ -69,11 +69,14 @@ public class PhraseHelper { * {@code rewriteQueryPred} is an extension hook to override the default choice of * {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default unknown query types are rewritten, * so use this to return {@link Boolean#FALSE} if you know the query doesn't need to be rewritten. + * Similarly, {@code preExtractRewriteFunction} is also an extension hook for extract to allow different queries + * to be set before the {@link WeightedSpanTermExtractor}'s extraction is invoked. * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is * usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones. 
*/ public PhraseHelper(Query query, String field, Function rewriteQueryPred, - boolean ignoreQueriesNeedingRewrite) { + Function<Query, Collection<Query>> preExtractRewriteFunction, + boolean ignoreQueriesNeedingRewrite) { this.fieldName = field; // if null then don't require field match // filter terms to those we want positionInsensitiveTerms = field != null ? new FieldFilteringTermHashSet(field) : new HashSet<>(); @@ -98,6 +101,18 @@ public PhraseHelper(Query query, String field, Function rewr } } + @Override + protected void extract(Query query, float boost, Map terms) throws IOException { + Collection newQueriesToExtract = preExtractRewriteFunction.apply(query); + if (newQueriesToExtract != null) { + for (Query newQuery : newQueriesToExtract) { + extract(newQuery, boost, terms); + } + } else { + super.extract(query, boost, terms); + } + } + @Override protected boolean isQueryUnsupported(Class clazz) { if (clazz.isAssignableFrom(MultiTermQuery.class)) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index 72be180c177a..cb5605c564d7 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -21,6 +21,7 @@ import java.text.BreakIterator; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.EnumSet; import java.util.HashMap; import java.util.List; @@ -732,7 +733,8 @@ protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, Sorte OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata); switch (offsetSource) { case ANALYSIS: - return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer()); + return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
this::preMultiTermQueryRewrite); case NONE_NEEDED: return NoOpOffsetStrategy.INSTANCE; case TERM_VECTORS: @@ -776,13 +778,14 @@ protected PhraseHelper getPhraseHelper(String field, Query query, EnumSet highlightFlags) { return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY) - ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES)) + ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES), + this::preMultiTermQueryRewrite) : ZERO_LEN_AUTOMATA_ARRAY; } @@ -830,6 +833,32 @@ protected Boolean requiresRewrite(SpanQuery spanQuery) { return null; } + /** + * When highlighting phrases accurately, we may need to handle custom queries that aren't supported in the + * {@link org.apache.lucene.search.highlight.WeightedSpanTermExtractor} as called by the {@link PhraseHelper}. + * Should custom query types be needed, this method should be overridden to return a collection of queries if appropriate, + * or null if nothing to do. If the query is not custom, simply returning null will allow the default rules to apply. + * + * @param query Query to be highlighted + * @return A Collection of Query object(s) if the query needs to be rewritten, otherwise null. + */ + protected Collection preSpanQueryRewrite(Query query) { + return null; + } + + /** + * When dealing with multi term queries / span queries, we may need to handle custom queries that aren't supported + * by the default automata extraction in {@link MultiTermHighlighting}. This can be overridden to return a collection + * of queries if appropriate, or null if nothing to do. If query is not custom, simply returning null will allow the + * default rules to apply. + * + * @param query Query to be highlighted + * @return A Collection of Query object(s) if the query needs to be rewritten, otherwise null.
+ */ + protected Collection preMultiTermQueryRewrite(Query query) { + return null; + } + private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) { return new DocIdSetIterator() { int idx = -1; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 0fd7d3d0a233..9eee6348b16c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -37,6 +37,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; @@ -45,6 +46,7 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; import org.apache.lucene.search.postingshighlight.WholeBreakIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -959,4 +961,84 @@ public String[] format(Passage passages[], String content) { ir.close(); } + public void testBooleanWithSpanAndOverlappingTerms() throws IOException { + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); + Field body = new Field("body", "There is no accord and satisfaction with this - Consideration of the accord is arbitrary.", fieldType); + Document doc = new Document(); + doc.add(body); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + + UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected List 
preSpanQueryRewrite(Query query) { + if (query instanceof MyQuery) { + return Collections.singletonList(((MyQuery)query).wrapped); + } + return null; + } + }; + + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + Query phraseQuery = new BoostQuery(new PhraseQuery("body", "accord", "and", "satisfaction"), 2.0f); + Query oredTerms = new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(2) + .add(new TermQuery(new Term("body", "accord")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "satisfaction")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "consideration")), BooleanClause.Occur.SHOULD) + .build(); + Query proximityBoostingQuery = new MyQuery(oredTerms); + Query totalQuery = bqBuilder + .add(phraseQuery, BooleanClause.Occur.SHOULD) + .add(proximityBoostingQuery, BooleanClause.Occur.SHOULD) + .build(); + TopDocs topDocs = searcher.search(totalQuery, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighter.highlight("body", totalQuery, topDocs); + assertArrayEquals(new String[]{"There is no accord and satisfaction with this - Consideration of the accord is arbitrary."}, snippets); + + ir.close(); + } + + private static class MyQuery extends Query { + + private final Query wrapped; + + MyQuery(Query wrapped) { + this.wrapped = wrapped; + } + + @Override + public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { + return wrapped.createWeight(searcher, needsScores, boost); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newWrapped = wrapped.rewrite(reader); + if (newWrapped != wrapped) { + return new MyQuery(newWrapped); + } + return this; + } + + @Override + public String toString(String field) { + return "[[["+wrapped.toString(field)+"]]]"; + } + + @Override + public boolean equals(Object obj) { + return obj != null && obj.getClass() == getClass() && 
wrapped.equals(((MyQuery)wrapped).wrapped); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + } + } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java index 63f0bb1ca5b6..af6487c06a6d 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java @@ -20,6 +20,8 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.List; +import java.util.Objects; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.Analyzer; @@ -56,6 +58,7 @@ import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; @@ -933,4 +936,91 @@ public void testPositionSensitiveWithWildcardDoesNotHighlight() throws Exception ir.close(); } + + public void testCustomSpanQueryHighlighting() throws Exception { + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); + Document doc = new Document(); + doc.add(new Field("body", "alpha bravo charlie delta echo foxtrot golf hotel india juliet", fieldType)); + doc.add(newTextField("id", "id", Field.Store.YES)); + + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected List preMultiTermQueryRewrite(Query query) { + if (query instanceof 
MyWrapperSpanQuery) { + return Collections.singletonList(((MyWrapperSpanQuery) query).originalQuery); + } + return null; + } + }; + + int docId = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; + + WildcardQuery wildcardQuery = new WildcardQuery(new Term("body", "foxtr*")); + SpanMultiTermQueryWrapper wildcardQueryWrapper = new SpanMultiTermQueryWrapper<>(wildcardQuery); + + + SpanQuery wrappedQuery = new MyWrapperSpanQuery(wildcardQueryWrapper); + + BooleanQuery query = new BooleanQuery.Builder() + .add(wrappedQuery, BooleanClause.Occur.SHOULD) + .build(); + + int[] docIds = new int[]{docId}; + + String snippets[] = highlighter.highlightFields(new String[]{"body"}, query, docIds, new int[]{2}).get("body"); + assertEquals(1, snippets.length); + assertEquals("alpha bravo charlie delta echo foxtrot golf hotel india juliet", snippets[0]); + ir.close(); + } + + private static class MyWrapperSpanQuery extends SpanQuery { + + private final SpanQuery originalQuery; + + private MyWrapperSpanQuery(SpanQuery originalQuery) { + this.originalQuery = Objects.requireNonNull(originalQuery); + } + + @Override + public String getField() { + return originalQuery.getField(); + } + + @Override + public String toString(String field) { + return "(Wrapper[" + originalQuery.toString(field)+"])"; + } + + @Override + public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { + return originalQuery.createWeight(searcher, needsScores, boost); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newOriginalQuery = originalQuery.rewrite(reader); + if (newOriginalQuery != originalQuery) { + return new MyWrapperSpanQuery((SpanQuery)newOriginalQuery); + } + return this; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + return originalQuery.equals(((MyWrapperSpanQuery)o).originalQuery); + } 
+ + @Override + public int hashCode() { + return originalQuery.hashCode(); + } + } + } From b71a0990da760cd6ac40be7e4aeee16d2906bac9 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Mon, 7 Nov 2016 20:51:14 -0500 Subject: [PATCH 2/2] Move test into different class more relevant to strict phrase handling --- .../uhighlight/TestUnifiedHighlighter.java | 80 ------------------- .../TestUnifiedHighlighterStrictPhrases.java | 78 ++++++++++++++++++ 2 files changed, 78 insertions(+), 80 deletions(-) diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 9eee6348b16c..fae937df4fc9 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -961,84 +961,4 @@ public String[] format(Passage passages[], String content) { ir.close(); } - public void testBooleanWithSpanAndOverlappingTerms() throws IOException { - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); - Field body = new Field("body", "There is no accord and satisfaction with this - Consideration of the accord is arbitrary.", fieldType); - Document doc = new Document(); - doc.add(body); - iw.addDocument(doc); - IndexReader ir = iw.getReader(); - iw.close(); - - IndexSearcher searcher = newSearcher(ir); - - UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { - @Override - protected List preSpanQueryRewrite(Query query) { - if (query instanceof MyQuery) { - return Collections.singletonList(((MyQuery)query).wrapped); - } - return null; - } - }; - - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - Query phraseQuery = new BoostQuery(new PhraseQuery("body", "accord", "and", "satisfaction"), 2.0f); - Query oredTerms = new BooleanQuery.Builder() - 
.setMinimumNumberShouldMatch(2) - .add(new TermQuery(new Term("body", "accord")), BooleanClause.Occur.SHOULD) - .add(new TermQuery(new Term("body", "satisfaction")), BooleanClause.Occur.SHOULD) - .add(new TermQuery(new Term("body", "consideration")), BooleanClause.Occur.SHOULD) - .build(); - Query proximityBoostingQuery = new MyQuery(oredTerms); - Query totalQuery = bqBuilder - .add(phraseQuery, BooleanClause.Occur.SHOULD) - .add(proximityBoostingQuery, BooleanClause.Occur.SHOULD) - .build(); - TopDocs topDocs = searcher.search(totalQuery, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); - String[] snippets = highlighter.highlight("body", totalQuery, topDocs); - assertArrayEquals(new String[]{"There is no accord and satisfaction with this - Consideration of the accord is arbitrary."}, snippets); - - ir.close(); - } - - private static class MyQuery extends Query { - - private final Query wrapped; - - MyQuery(Query wrapped) { - this.wrapped = wrapped; - } - - @Override - public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { - return wrapped.createWeight(searcher, needsScores, boost); - } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - Query newWrapped = wrapped.rewrite(reader); - if (newWrapped != wrapped) { - return new MyQuery(newWrapped); - } - return this; - } - - @Override - public String toString(String field) { - return "[[["+wrapped.toString(field)+"]]]"; - } - - @Override - public boolean equals(Object obj) { - return obj != null && obj.getClass() == getClass() && wrapped.equals(((MyQuery)wrapped).wrapped); - } - - @Override - public int hashCode() { - return wrapped.hashCode(); - } - } - } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java index 5fecdc6d5bc9..a60dfde1cf5c 100644 --- 
a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java @@ -17,6 +17,9 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.List; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.MockAnalyzer; @@ -29,14 +32,17 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; @@ -401,4 +407,76 @@ public void testMatchNoDocsQuery() throws IOException { Object o = highlighter.highlightWithoutSearcher("body", new MatchNoDocsQuery(), content, 1); assertEquals(content, o); } + + public void testPreSpanQueryRewrite() throws IOException { + indexWriter.addDocument(newDoc("There is no accord and satisfaction with this - Consideration of the accord is arbitrary.")); + initReaderSearcherHighlighter(); + + highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Collection preSpanQueryRewrite(Query query) { + if (query instanceof MyQuery) { + return Collections.singletonList(((MyQuery)query).wrapped); + } + return null; + } + 
}; + highlighter.setHighlightPhrasesStrictly(true); + + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + Query phraseQuery = new BoostQuery(new PhraseQuery("body", "accord", "and", "satisfaction"), 2.0f); + Query oredTerms = new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(2) + .add(new TermQuery(new Term("body", "accord")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "satisfaction")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "consideration")), BooleanClause.Occur.SHOULD) + .build(); + Query proximityBoostingQuery = new MyQuery(oredTerms); + Query totalQuery = bqBuilder + .add(phraseQuery, BooleanClause.Occur.SHOULD) + .add(proximityBoostingQuery, BooleanClause.Occur.SHOULD) + .build(); + TopDocs topDocs = searcher.search(totalQuery, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighter.highlight("body", totalQuery, topDocs); + assertArrayEquals(new String[]{"There is no accord and satisfaction with this - Consideration of the accord is arbitrary."}, snippets); + } + + private static class MyQuery extends Query { + + private final Query wrapped; + + MyQuery(Query wrapped) { + this.wrapped = wrapped; + } + + @Override + public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { + return wrapped.createWeight(searcher, needsScores, boost); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newWrapped = wrapped.rewrite(reader); + if (newWrapped != wrapped) { + return new MyQuery(newWrapped); + } + return this; + } + + @Override + public String toString(String field) { + return "[[["+wrapped.toString(field)+"]]]"; + } + + @Override + public boolean equals(Object obj) { + return obj != null && obj.getClass() == getClass() && wrapped.equals(((MyQuery)obj).wrapped); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + } }