From 8a12584b246190ee6c8cab94b164835943c04632 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Sun, 24 Jun 2018 23:45:32 -0400 Subject: [PATCH 01/14] LUCENE-8286: UH initial support for Weight.matches --- .../uhighlight/FieldOffsetStrategy.java | 40 ++++++++++++++++++- .../lucene/search/uhighlight/OffsetsEnum.java | 35 ++++++++++++++++ .../search/uhighlight/PhraseHelper.java | 35 +++++++++++++--- .../search/uhighlight/UnifiedHighlighter.java | 11 ++++- 4 files changed, 112 insertions(+), 9 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index cf564f4ddba3..3a4d50863798 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -25,6 +25,10 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.ScoreMode; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -70,9 +74,13 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc final List offsetsEnums = new ArrayList<>(terms.length + automata.length); + final boolean useWeightMatchesIter = phraseHelper.useWeightMatchesIter(); + // Handle position insensitive terms (a subset of this.terms field): final BytesRef[] insensitiveTerms; - if (phraseHelper.hasPositionSensitivity()) { + if (useWeightMatchesIter) { + insensitiveTerms = new BytesRef[0]; // don't -- WEIGHT_MATCHES will handle these + } else if (phraseHelper.hasPositionSensitivity()) { insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms(); assert insensitiveTerms.length <= terms.length : "insensitive terms should be smaller set of all terms"; } else { @@ -82,16 +90,22 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc createOffsetsEnumsForTerms(insensitiveTerms, termsIndex, doc, offsetsEnums); } + // Handle Weight.matches approach + if (useWeightMatchesIter) { + createOffsetsEnumsWeightMatcher(leafReader, doc, offsetsEnums); + } + // Handle spans if (phraseHelper.hasPositionSensitivity()) { phraseHelper.createOffsetsEnumsForSpans(leafReader, doc, offsetsEnums); } // Handle automata - if (automata.length > 0) { + if (automata.length > 0 && useWeightMatchesIter==false) {// WEIGHT_MATCHES handled these already createOffsetsEnumsForAutomata(termsIndex, doc, offsetsEnums); } + //TODO if only one OE then return it; don't wrap in Multi. If none, return NONE return new OffsetsEnum.MultiOffsetsEnum(offsetsEnums); } @@ -111,6 +125,28 @@ protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsInd } } + protected void createOffsetsEnumsWeightMatcher(LeafReader leafReader, int docId, List results) throws IOException { + IndexSearcher indexSearcher = new IndexSearcher(leafReader); + indexSearcher.setQueryCache(null); + Matches matches = indexSearcher.rewrite(phraseHelper.getQuery()) + .createWeight(indexSearcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f) + .matches(leafReader.getContext(), docId); + if (matches == null) { + return; // doc doesn't match + } + for (String field : matches) { + if (phraseHelper.getFieldMatcher().test(field)) { + MatchesIterator iterator = matches.getMatches(field); + if (iterator == null) { + continue; + } + //note: the iterator should be positioned on the first one already + results.add(new OffsetsEnum.OfMatchesIterator(iterator)); + } + } + + } + protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List results) throws IOException { List> automataPostings = new ArrayList<>(automata.length); for (int i = 0; i < automata.length; i++) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java index 55f3c37d4934..8e759b970128 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java @@ -24,6 +24,7 @@ import java.util.PriorityQueue; import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; @@ -153,6 +154,39 @@ public int freq() throws IOException { } } + public static class OfMatchesIterator extends OffsetsEnum { + private final MatchesIterator matchesIterator; + + public OfMatchesIterator(MatchesIterator matchesIterator) { + this.matchesIterator = matchesIterator; + } + + @Override + public boolean nextPosition() throws IOException { + return matchesIterator.next(); + } + + @Override + public int freq() throws IOException { + return 1; // nocommit terrible + } + + @Override + public BytesRef getTerm() throws IOException { + return new BytesRef(); // nocommit terrible + } + + @Override + public int startOffset() throws IOException { + return matchesIterator.startOffset(); + } + + @Override + public int endOffset() throws IOException { + return matchesIterator.endOffset(); + } + } + /** * Empty enumeration */ @@ -187,6 +221,7 @@ public int freq() throws IOException { /** * A view over several OffsetsEnum instances, merging them in-place */ + //If OffsetsEnum and MatchesIterator ever truly merge then this could go away in lieu of DisjunctionMatchesIterator public static class MultiOffsetsEnum extends OffsetsEnum { private final PriorityQueue queue; diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java index ab439d09a2bc..dd00b3d7d008 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java @@ -38,7 +38,9 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; @@ -65,13 +67,15 @@ public class PhraseHelper { public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_", - (s) -> false, spanQuery -> null, query -> null, true); + (s) -> false, spanQuery -> null, query -> null, true, false); + private final Query query; private final String fieldName; private final Set positionInsensitiveTerms; // (TermQuery terms) private final Set spanQueries; private final boolean willRewrite; private final Predicate fieldMatcher; + private final boolean useWeightMatchesIter; /** * Constructor. @@ -86,13 +90,18 @@ public class PhraseHelper { */ public PhraseHelper(Query query, String field, Predicate fieldMatcher, Function rewriteQueryPred, Function> preExtractRewriteFunction, - boolean ignoreQueriesNeedingRewrite) { + boolean ignoreQueriesNeedingRewrite, boolean useWeightMatchesIter) { + this.query = query; this.fieldName = field; this.fieldMatcher = fieldMatcher; + this.useWeightMatchesIter = useWeightMatchesIter; // filter terms to those we want positionInsensitiveTerms = new HashSet<>(); spanQueries = new HashSet<>(); + // if useWeightMatchesIter: + // TODO once SpanQueries implement Weight.matches, we needn't capture them here. But note SpanMultiTermQueryWrapper + // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls boolean[] mustRewriteHolder = {false}; // boolean wrapped in 1-ary array so it's mutable from inner class @@ -132,6 +141,8 @@ protected void extract(Query query, float boost, Map t for (Query newQuery : newQueriesToExtract) { extract(newQuery, boost, terms); } + } else if (useWeightMatchesIter && (query instanceof PhraseQuery || query instanceof MultiPhraseQuery)) { + return;// do nothing; these queries have Weight impls with proper getMulti } else { super.extract(query, boost, terms); } @@ -149,8 +160,10 @@ protected boolean isQueryUnsupported(Class clazz) { @Override protected void extractWeightedTerms(Map terms, Query query, float boost) throws IOException { - query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, ScoreMode.COMPLETE_NO_SCORES, boost) - .extractTerms(extractPosInsensitiveTermsTarget); + if (!useWeightMatchesIter) { // when useWeightMatchesIterator, we needn't track these terms + query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, ScoreMode.COMPLETE_NO_SCORES, boost) + .extractTerms(extractPosInsensitiveTermsTarget); + } } // called on SpanQueries. Some other position-sensitive queries like PhraseQuery are converted beforehand @@ -185,6 +198,18 @@ protected boolean mustRewriteQuery(SpanQuery spanQuery) { willRewrite = mustRewriteHolder[0]; } + public Query getQuery() { + return query; + } + + public boolean useWeightMatchesIter() { + return useWeightMatchesIter; + } + + public Predicate getFieldMatcher() { + return fieldMatcher; + } + public Set getSpanQueries() { return spanQueries; } @@ -205,7 +230,7 @@ public boolean willRewrite() { return willRewrite; } - /** Returns the terms that are position-insensitive (sorted). */ + /** Returns the terms that are position-insensitive (sorted). Always empty if {@link #useWeightMatchesIter}. */ public BytesRef[] getAllPositionInsensitiveTerms() { BytesRef[] result = positionInsensitiveTerms.toArray(new BytesRef[positionInsensitiveTerms.size()]); Arrays.sort(result); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index cfc918f878c6..a52f99bbe785 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -769,6 +769,7 @@ protected static BytesRef[] filterExtractedTerms(Predicate fieldMatcher, protected Set getFlags(String field) { Set highlightFlags = EnumSet.noneOf(HighlightFlag.class); + highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);//nocommit for experimentation, use as default if (shouldHandleMultiTermQuery(field)) { highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY); } @@ -784,9 +785,14 @@ protected Set getFlags(String field) { protected PhraseHelper getPhraseHelper(String field, Query query, Set highlightFlags) { boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES); boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY); + boolean useWeightMatchesIter = highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES); return highlightPhrasesStrictly ? new PhraseHelper(query, field, getFieldMatcher(field), - this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : PhraseHelper.NONE; + this::requiresRewrite, + this::preSpanQueryRewrite, + !handleMultiTermQuery, + useWeightMatchesIter) + : PhraseHelper.NONE; } protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set highlightFlags) { @@ -1090,7 +1096,8 @@ public CacheHelper getReaderCacheHelper() { public enum HighlightFlag { PHRASES, MULTI_TERM_QUERY, - PASSAGE_RELEVANCY_OVER_SPEED + PASSAGE_RELEVANCY_OVER_SPEED, + WEIGHT_MATCHES // TODO: ignoreQueryFields // TODO: useQueryBoosts } From d01b968ca46434e4282207e50260449713c6fbb2 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Wed, 11 Jul 2018 16:13:22 -0400 Subject: [PATCH 02/14] LUCENE-8286: Refactor out a UHComponents of FieldOffsetStrategy constructor (to reduce number of args to all the OffsetStrategy impls) --- .../uhighlight/AnalysisOffsetStrategy.java | 6 +- .../uhighlight/FieldOffsetStrategy.java | 13 ++-- .../uhighlight/MemoryIndexOffsetStrategy.java | 9 ++- .../search/uhighlight/NoOpOffsetStrategy.java | 2 +- .../uhighlight/PostingsOffsetStrategy.java | 8 +-- ...PostingsWithTermVectorsOffsetStrategy.java | 6 +- .../uhighlight/TermVectorOffsetStrategy.java | 6 +- .../uhighlight/TokenStreamOffsetStrategy.java | 10 ++- .../search/uhighlight/UHComponents.java | 64 +++++++++++++++++++ .../search/uhighlight/UnifiedHighlighter.java | 12 ++-- .../TestUnifiedHighlighterExtensibility.java | 6 +- 11 files changed, 103 insertions(+), 39 deletions(-) create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java index 162d27003947..6b0b60804862 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java @@ -23,8 +23,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * Provides a base class for analysis based offset strategies to extend from. @@ -37,8 +35,8 @@ public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy { protected final Analyzer analyzer; - public AnalysisOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) { - super(field, queryTerms, phraseHelper, automata); + public AnalysisOffsetStrategy(UHComponents components, Analyzer analyzer) { + super(components); this.analyzer = analyzer; if (analyzer.getOffsetGap(field) != 1) { // note: 1 is the default. It is RARELY changed. throw new IllegalArgumentException( diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index 3a4d50863798..4e3be826cff8 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -41,16 +41,19 @@ */ public abstract class FieldOffsetStrategy { + protected final UHComponents components; + //nocommit remove these?: protected final String field; protected final PhraseHelper phraseHelper; // Query: position-sensitive information protected final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive) protected final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive - public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) { - this.field = field; - this.terms = queryTerms; - this.phraseHelper = phraseHelper; - this.automata = automata; + public FieldOffsetStrategy(UHComponents components) { + this.components = components; + this.field = components.getField(); + this.terms = components.getExtractedTerms(); + this.phraseHelper = components.getPhraseHelper(); + this.automata = components.getAutomata(); } public String getField() { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index 6364f3fcd5cd..a3b84746a023 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -50,15 +50,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { private final LeafReader leafReader; private final CharacterRunAutomaton preMemIndexFilterAutomaton; - public MemoryIndexOffsetStrategy(String field, Predicate fieldMatcher, BytesRef[] extractedTerms, PhraseHelper phraseHelper, - CharacterRunAutomaton[] automata, Analyzer analyzer, + public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer, Function> multiTermQueryRewrite) { - super(field, extractedTerms, phraseHelper, automata, analyzer); - boolean storePayloads = phraseHelper.hasPositionSensitivity(); // might be needed + super(components, analyzer); + boolean storePayloads = components.getPhraseHelper().hasPositionSensitivity(); // might be needed memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable // preFilter for MemoryIndex - preMemIndexFilterAutomaton = buildCombinedAutomaton(fieldMatcher, terms, this.automata, phraseHelper, multiTermQueryRewrite); + preMemIndexFilterAutomaton = buildCombinedAutomaton(components.getFieldMatcher(), terms, this.automata, components.getPhraseHelper(), multiTermQueryRewrite); } /** diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java index 44a23f707294..8e9e7aa67578 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java @@ -32,7 +32,7 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy { public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy(); private NoOpOffsetStrategy() { - super("_ignored_", new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0]); + super(new UHComponents("_ignored_", (s) -> false, new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0])); } @Override diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java index b7df77a8f297..e848a3e90526 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java @@ -22,10 +22,6 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.ReaderUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * Uses offsets in postings -- {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. This @@ -35,8 +31,8 @@ */ public class PostingsOffsetStrategy extends FieldOffsetStrategy { - public PostingsOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) { - super(field, queryTerms, phraseHelper, automata); + public PostingsOffsetStrategy(UHComponents components) { + super(components); } @Override diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java index 9097e4d68b5a..e0eb3212abf2 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java @@ -24,8 +24,6 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Terms; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * Like {@link PostingsOffsetStrategy} but also uses term vectors (only terms needed) for multi-term queries. @@ -34,8 +32,8 @@ */ public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy { - public PostingsWithTermVectorsOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) { - super(field, queryTerms, phraseHelper, automata); + public PostingsWithTermVectorsOffsetStrategy(UHComponents components) { + super(components); } @Override diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java index cd19bb9d0d4b..5d0e0b933f32 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java @@ -22,8 +22,6 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Terms; import org.apache.lucene.search.highlight.TermVectorLeafReader; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.automaton.CharacterRunAutomaton; /** * Uses term vectors that contain offsets. @@ -32,8 +30,8 @@ */ public class TermVectorOffsetStrategy extends FieldOffsetStrategy { - public TermVectorOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) { - super(field, queryTerms, phraseHelper, automata); + public TermVectorOffsetStrategy(UHComponents components) { + super(components); } @Override diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java index b81e63e282e9..fbe3246e27eb 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java @@ -36,8 +36,14 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0]; - public TokenStreamOffsetStrategy(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer indexAnalyzer) { - super(field, ZERO_LEN_BYTES_REF_ARRAY, phraseHelper, convertTermsToAutomata(terms, automata), indexAnalyzer); + public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) { + super(new UHComponents( + components.getField(), + components.getFieldMatcher(), + ZERO_LEN_BYTES_REF_ARRAY, + components.getPhraseHelper(), + convertTermsToAutomata(components.getExtractedTerms(), components.getAutomata())), + indexAnalyzer); assert phraseHelper.hasPositionSensitivity() == false; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java new file mode 100644 index 000000000000..a8ea1c0273cf --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.uhighlight; + +import java.util.function.Predicate; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; + +/** + * A parameter object to hold the components a {@link FieldOffsetStrategy} needs. + * + * @lucene.internal + */ +public class UHComponents { + private final String field; + private final Predicate fieldMatcher; + private final BytesRef[] extractedTerms; + private final PhraseHelper phraseHelper; + private final CharacterRunAutomaton[] automata; + + public UHComponents(String field, Predicate fieldMatcher, BytesRef[] extractedTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) { + this.field = field; + this.fieldMatcher = fieldMatcher; + this.extractedTerms = extractedTerms; + this.phraseHelper = phraseHelper; + this.automata = automata; + } + + public String getField() { + return field; + } + + public Predicate getFieldMatcher() { + return fieldMatcher; + } + + public BytesRef[] getExtractedTerms() { + return extractedTerms; + } + + public PhraseHelper getPhraseHelper() { + return phraseHelper; + } + + public CharacterRunAutomaton[] getAutomata() { + return automata; + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index a52f99bbe785..d604df7e3186 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -838,24 +838,24 @@ protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set highlightFlags) { + final UHComponents components = new UHComponents(field, getFieldMatcher(field), terms, phraseHelper, automata); switch (offsetSource) { case ANALYSIS: if (!phraseHelper.hasPositionSensitivity() && !highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) { //skip using a memory index since it's pure term filtering - return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer()); + return new TokenStreamOffsetStrategy(components, getIndexAnalyzer()); } else { - return new MemoryIndexOffsetStrategy(field, getFieldMatcher(field), terms, phraseHelper, automata, getIndexAnalyzer(), - this::preMultiTermQueryRewrite); + return new MemoryIndexOffsetStrategy(components, getIndexAnalyzer(), this::preMultiTermQueryRewrite); } case NONE_NEEDED: return NoOpOffsetStrategy.INSTANCE; case TERM_VECTORS: - return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata); + return new TermVectorOffsetStrategy(components); case POSTINGS: - return new PostingsOffsetStrategy(field, terms, phraseHelper, automata); + return new PostingsOffsetStrategy(components); case POSTINGS_WITH_TERM_VECTORS: - return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata); + return new PostingsWithTermVectorsOffsetStrategy(components); default: throw new IllegalArgumentException("Unrecognized offset source " + offsetSource); } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java index 4eaa821b3c37..742117be1aeb 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java @@ -40,6 +40,7 @@ import org.apache.lucene.search.uhighlight.PassageScorer; import org.apache.lucene.search.uhighlight.PhraseHelper; import org.apache.lucene.search.uhighlight.SplittingBreakIterator; +import org.apache.lucene.search.uhighlight.UHComponents; import org.apache.lucene.search.uhighlight.UnifiedHighlighter; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -58,10 +59,11 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase { @Test public void testFieldOffsetStrategyExtensibility() { final UnifiedHighlighter.OffsetSource offsetSource = UnifiedHighlighter.OffsetSource.NONE_NEEDED; - FieldOffsetStrategy strategy = new FieldOffsetStrategy("field", + FieldOffsetStrategy strategy = new FieldOffsetStrategy(new UHComponents("field", + (s) -> false, new BytesRef[0], PhraseHelper.NONE, - new CharacterRunAutomaton[0]) { + new CharacterRunAutomaton[0])) { @Override public UnifiedHighlighter.OffsetSource getOffsetSource() { return offsetSource; From df8398173ddbff2227ffb002c66020da00bc72d0 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Wed, 11 Jul 2018 17:13:41 -0400 Subject: [PATCH 03/14] LUCENE-8286 OffsetsEnum should order by endOffset following startOffset. Was a non-issue when OE == term but more of an issue in other cases. Now matches order within the Passage is more sane in such cases. --- .../java/org/apache/lucene/search/uhighlight/OffsetsEnum.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java index 8e759b970128..906117247c2f 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java @@ -47,6 +47,10 @@ public int compareTo(OffsetsEnum other) { if (cmp != 0) { return cmp; // vast majority of the time we return here. } + cmp = Integer.compare(endOffset(), other.endOffset()); + if (cmp != 0) { + return cmp; + } final BytesRef thisTerm = this.getTerm(); final BytesRef otherTerm = other.getTerm(); if (thisTerm == null || otherTerm == null) { From 73f8888a812da5aa83dd80bfbb07c9a9dd420c6f Mon Sep 17 00:00:00 2001 From: David Smiley Date: Wed, 11 Jul 2018 17:16:30 -0400 Subject: [PATCH 04/14] LUCENE-8286 UH Formatter: Overlapping matches within passage should be merged. When matches overlap, the formatted passage could contain weird end/open pairs. Formatting as if the matches were merged is more sane I think. --- .../uhighlight/DefaultPassageFormatter.java | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java index 62d58df70f08..8d2a424d1004 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java @@ -70,17 +70,24 @@ public String format(Passage passages[], String content) { pos = passage.getStartOffset(); for (int i = 0; i < passage.getNumMatches(); i++) { int start = passage.getMatchStarts()[i]; + assert start >= pos && start < passage.getEndOffset(); + //append content before this start + append(sb, content, pos, start); + int end = passage.getMatchEnds()[i]; - // its possible to have overlapping terms - if (start > pos) { - append(sb, content, pos, start); - } - if (end > pos) { - sb.append(preTag); - append(sb, content, Math.max(pos, start), end); - sb.append(postTag); - pos = end; + assert end > start; + // its possible to have overlapping terms. + // Look ahead to expand 'end' past all overlapping: + while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) { + end = passage.getMatchEnds()[++i]; } + end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage + + sb.append(preTag); + append(sb, content, start, end); + sb.append(postTag); + + pos = end; } // its possible a "term" from the analyzer could span a sentence boundary. append(sb, content, pos, Math.max(pos, passage.getEndOffset())); From 93c4663a763decdf0d4bff2d9c7520045db81b74 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Wed, 11 Jul 2018 17:24:22 -0400 Subject: [PATCH 05/14] LUCENE-8286 UH: Support for requireFieldMatch/fieldMatcher when using Weight Matches. --- .../uhighlight/FieldOffsetStrategy.java | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index 4e3be826cff8..a1ff51ba4041 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -128,7 +128,29 @@ protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsInd } } - protected void createOffsetsEnumsWeightMatcher(LeafReader leafReader, int docId, List results) throws IOException { + protected void createOffsetsEnumsWeightMatcher(LeafReader _leafReader, int docId, List results) throws IOException { + // remap fieldMatcher/requireFieldMatch fields to the field we are highlighting + LeafReader leafReader = new FilterLeafReader(_leafReader) { + @Override + public Terms terms(String field) throws IOException { + if (components.getFieldMatcher().test(field)) { + return super.terms(components.getField()); + } else { + return super.terms(field); + } + } + + //these ought to be a default? So many subclasses do this! + @Override + public CacheHelper getCoreCacheHelper() { + return null; + } + + @Override + public CacheHelper getReaderCacheHelper() { + return null; + } + }; IndexSearcher indexSearcher = new IndexSearcher(leafReader); indexSearcher.setQueryCache(null); Matches matches = indexSearcher.rewrite(phraseHelper.getQuery()) From a3ef8be7c981f90b50385832283f460773af5304 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Wed, 11 Jul 2018 17:30:01 -0400 Subject: [PATCH 06/14] forgot these imports during selective committing --- .../apache/lucene/search/uhighlight/FieldOffsetStrategy.java | 1 + .../apache/lucene/search/uhighlight/PostingsOffsetStrategy.java | 2 ++ 2 files changed, 3 insertions(+) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index a1ff51ba4041..086a3d5752a5 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.PostingsEnum; diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java index e848a3e90526..469ce440730f 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java @@ -22,6 +22,8 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.ReaderUtil; /** * Uses offsets in postings -- {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. This From 42947da3892b325308baf89ad5dbde521226b0ff Mon Sep 17 00:00:00 2001 From: David Smiley Date: Wed, 11 Jul 2018 17:57:35 -0400 Subject: [PATCH 07/14] LUCENE-8286 Overlay LeafReader with offsets with the real LeafReader when different. Weight Matches highlighting needs this for some cases; especially when requireFieldMatch/fieldMatcher is in use. Refactored FieldOffsetStrategy.getOffsetsEnum to take a LeafReader not IndexReader. --- .../search/uhighlight/FieldHighlighter.java | 4 +- .../uhighlight/FieldOffsetStrategy.java | 3 +- .../uhighlight/MemoryIndexOffsetStrategy.java | 21 ++-- .../search/uhighlight/NoOpOffsetStrategy.java | 4 +- .../OverlaySingleDocTermsLeafReader.java | 113 ++++++++++++++++++ .../uhighlight/PostingsOffsetStrategy.java | 18 +-- ...PostingsWithTermVectorsOffsetStrategy.java | 18 +-- .../TermVectorFilteredLeafReader.java | 11 +- .../uhighlight/TermVectorOffsetStrategy.java | 15 ++- .../uhighlight/TokenStreamOffsetStrategy.java | 4 +- .../search/uhighlight/UnifiedHighlighter.java | 21 +++- .../TestUnifiedHighlighterExtensibility.java | 5 +- 12 files changed, 174 insertions(+), 63 deletions(-) create mode 100644 lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OverlaySingleDocTermsLeafReader.java diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java index da6e5256ac5b..219091ac829f 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java @@ -24,7 +24,7 @@ import java.util.List; import java.util.PriorityQueue; -import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.util.BytesRef; /** @@ -65,7 +65,7 @@ public UnifiedHighlighter.OffsetSource getOffsetSource() { /** * The primary method -- highlight this doc, assuming a specific field and given this content. */ - public Object highlightFieldForDoc(IndexReader reader, int docId, String content) throws IOException { + public Object highlightFieldForDoc(LeafReader reader, int docId, String content) throws IOException { // TODO accept LeafReader instead? // note: it'd be nice to accept a CharSequence for content, but we need a CharacterIterator impl for it. if (content.length() == 0) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index 086a3d5752a5..7898ce148029 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -21,7 +21,6 @@ import java.util.List; import org.apache.lucene.index.FilterLeafReader; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Terms; @@ -68,7 +67,7 @@ public String getField() { * * Callers are expected to close the returned OffsetsEnum when it has been finished with */ - public abstract OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) throws IOException; + public abstract OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException; protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc) throws IOException { final Terms termsIndex = leafReader.terms(field); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index a3b84746a023..a35591b71ae4 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -29,7 +29,6 @@ import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.search.Query; @@ -47,7 +46,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { private final MemoryIndex memoryIndex; - private final LeafReader leafReader; + private final LeafReader memIndexLeafReader; private final CharacterRunAutomaton preMemIndexFilterAutomaton; public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer, @@ -55,7 +54,7 @@ public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer, super(components, analyzer); boolean storePayloads = components.getPhraseHelper().hasPositionSensitivity(); // might be needed memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets - leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable + memIndexLeafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable // preFilter for MemoryIndex preMemIndexFilterAutomaton = buildCombinedAutomaton(components.getFieldMatcher(), terms, this.automata, components.getPhraseHelper(), multiTermQueryRewrite); } @@ -99,7 +98,7 @@ public boolean run(char[] chars, int offset, int length) { } @Override - public OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) throws IOException { + public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException { // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength TokenStream tokenStream = tokenStream(content); @@ -107,12 +106,20 @@ public OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton); memoryIndex.reset(); memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close() - docId = 0; - return createOffsetsEnumFromReader(leafReader, docId); + if (reader == null) { + return createOffsetsEnumFromReader(memIndexLeafReader, 0); + } else { + return createOffsetsEnumFromReader( + new OverlaySingleDocTermsLeafReader( + reader, + memIndexLeafReader, + field, + docId), + docId); + } } - private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream, final CharacterRunAutomaton charRunAutomaton) { // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface? diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java index 8e9e7aa67578..e42ff2353f1e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java @@ -18,7 +18,7 @@ import java.io.IOException; -import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -41,7 +41,7 @@ public UnifiedHighlighter.OffsetSource getOffsetSource() { } @Override - public OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) throws IOException { + public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException { return OffsetsEnum.EMPTY; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OverlaySingleDocTermsLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OverlaySingleDocTermsLeafReader.java new file mode 100644 index 000000000000..7b9412d9739a --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OverlaySingleDocTermsLeafReader.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.uhighlight; + +import java.io.IOException; + +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; + +/** + * Overlays a 2nd LeafReader for the terms of one field, otherwise the primary reader is + * consulted. The 2nd reader is assumed to have one document of 0 and we remap it to a target doc ID. + * + * @lucene.experimental + */ +public class OverlaySingleDocTermsLeafReader extends FilterLeafReader { + + private final LeafReader in2; + private final String in2Field; + private final int in2TargetDocId; + + public OverlaySingleDocTermsLeafReader(LeafReader in, LeafReader in2, String in2Field, int in2TargetDocId) { + super(in); + this.in2 = in2; + this.in2Field = in2Field; + this.in2TargetDocId = in2TargetDocId; + assert in2.maxDoc() == 1; + } + + @Override + public Terms terms(String field) throws IOException { + if (!in2Field.equals(field)) { + return in.terms(field); + } + + // Shifts leafReader in2 with only doc ID 0 to a target doc ID + final Terms terms = in2.terms(field); + if (terms == null) { + return null; + } + if (in2TargetDocId == 0) { // no doc ID remapping to do + return terms; + } + return new FilterTerms(terms) { + @Override + public TermsEnum iterator() throws IOException { + return filterTermsEnum(super.iterator()); + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + return filterTermsEnum(super.intersect(compiled, startTerm)); + } + + private TermsEnum filterTermsEnum(TermsEnum termsEnum) throws IOException { + return new FilterTermsEnum(termsEnum) { + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + //TODO 'reuse' will always fail to reuse unless we unwrap it + return new FilterPostingsEnum(super.postings(reuse, flags)) { + @Override + public int nextDoc() throws IOException { + final int doc = super.nextDoc(); + return doc == 0 ? in2TargetDocId : doc; + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + + @Override + public int docID() { + final int doc = super.docID(); + return doc == 0 ? in2TargetDocId : doc; + } + }; + } + }; + } + }; + } + + @Override + public CacheHelper getCoreCacheHelper() { + return null; + } + + @Override + public CacheHelper getReaderCacheHelper() { + return null; + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java index 469ce440730f..53c1c95b4b6c 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java @@ -17,13 +17,9 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; -import java.util.List; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.ReaderUtil; /** * Uses offsets in postings -- {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. This @@ -38,18 +34,8 @@ public PostingsOffsetStrategy(UHComponents components) { } @Override - public OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) throws IOException { - final LeafReader leafReader; - if (reader instanceof LeafReader) { - leafReader = (LeafReader) reader; - } else { - List leaves = reader.leaves(); - LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves)); - leafReader = leafReaderContext.reader(); - docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader - } - - return createOffsetsEnumFromReader(leafReader, docId); + public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException { + return createOffsetsEnumFromReader(reader, docId); } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java index e0eb3212abf2..ab391213aba8 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java @@ -17,12 +17,8 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; -import java.util.List; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Terms; /** @@ -37,22 +33,12 @@ public PostingsWithTermVectorsOffsetStrategy(UHComponents components) { } @Override - public OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) throws IOException { - LeafReader leafReader; - if (reader instanceof LeafReader) { - leafReader = (LeafReader) reader; - } else { - List leaves = reader.leaves(); - LeafReaderContext LeafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves)); - leafReader = LeafReaderContext.reader(); - docId -= LeafReaderContext.docBase; // adjust 'doc' to be within this atomic reader - } - + public OffsetsEnum getOffsetsEnum(LeafReader leafReader, int docId, String content) throws IOException { Terms docTerms = leafReader.getTermVector(docId, field); if (docTerms == null) { return OffsetsEnum.EMPTY; } - leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms); + leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms, field); return createOffsetsEnumFromReader(leafReader, docId); } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java index 4165af45a9e8..4b93e16af053 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorFilteredLeafReader.java @@ -37,21 +37,26 @@ final class TermVectorFilteredLeafReader extends FilterLeafReader { // NOTE: super ("in") is baseLeafReader private final Terms filterTerms; + private final String fieldFilter; /** *

Construct a FilterLeafReader based on the specified base reader. *

Note that base reader is closed if this FilterLeafReader is closed.

- * * @param baseLeafReader full/original reader. * @param filterTerms set of terms to filter by -- probably from a TermVector or MemoryIndex. + * @param fieldFilter the field to do this on */ - TermVectorFilteredLeafReader(LeafReader baseLeafReader, Terms filterTerms) { + TermVectorFilteredLeafReader(LeafReader baseLeafReader, Terms filterTerms, String fieldFilter) { super(baseLeafReader); this.filterTerms = filterTerms; + this.fieldFilter = fieldFilter; } @Override public Terms terms(String field) throws IOException { + if (!field.equals(fieldFilter)) { + return super.terms(field); // proceed like normal for fields we're not interested in + } Terms terms = in.terms(field); return terms==null ? null : new TermsFilteredTerms(terms, filterTerms); } @@ -106,7 +111,7 @@ void moveToCurrentTerm() throws IOException { boolean termInBothTermsEnum = baseTermsEnum.seekExact(currentTerm); if (!termInBothTermsEnum) { - throw new IllegalStateException("Term vector term " + currentTerm + " does not appear in full index."); + throw new IllegalStateException("Term vector term '" + currentTerm.utf8ToString() + "' does not appear in full index."); } } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java index 5d0e0b933f32..895e95e50919 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java @@ -18,7 +18,6 @@ import java.io.IOException; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Terms; import org.apache.lucene.search.highlight.TermVectorLeafReader; @@ -40,16 +39,20 @@ public UnifiedHighlighter.OffsetSource getOffsetSource() { } @Override - public OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) throws IOException { + public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException { Terms tvTerms = reader.getTermVector(docId, field); if (tvTerms == null) { return OffsetsEnum.EMPTY; } - LeafReader leafReader = new TermVectorLeafReader(field, tvTerms); - docId = 0; - - return createOffsetsEnumFromReader(leafReader, docId); + LeafReader singleDocReader = new TermVectorLeafReader(field, tvTerms); + return createOffsetsEnumFromReader( + new OverlaySingleDocTermsLeafReader( + reader, + singleDocReader, + field, + docId), + docId); } } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java index fbe3246e27eb..e851ca660cdd 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java @@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -64,7 +64,7 @@ public String toString() { } @Override - public OffsetsEnum getOffsetsEnum(IndexReader reader, int docId, String content) throws IOException { + public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException { return new TokenStreamOffsetsEnum(tokenStream(content), automata); } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index d604df7e3186..d55ec3fefefa 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -48,6 +48,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; import org.apache.lucene.search.DocIdSetIterator; @@ -623,11 +624,20 @@ protected Map highlightFieldsAsObjects(String[] fieldsIn, Quer && indexReaderWithTermVecCache != null) ? indexReaderWithTermVecCache : searcher.getIndexReader(); + final LeafReader leafReader; + if (indexReader instanceof LeafReader) { + leafReader = (LeafReader) indexReader; + } else { + List leaves = indexReader.leaves(); + LeafReaderContext leafReaderContext = leaves.get(ReaderUtil.subIndex(docId, leaves)); + leafReader = leafReaderContext.reader(); + docId -= leafReaderContext.docBase; // adjust 'doc' to be within this leaf reader + } int docInIndex = docInIndexes[docIdx];//original input order assert resultByDocIn[docInIndex] == null; resultByDocIn[docInIndex] = fieldHighlighter - .highlightFieldForDoc(indexReader, docId, content.toString()); + .highlightFieldForDoc(leafReader, docId, content.toString()); } } @@ -769,13 +779,15 @@ protected static BytesRef[] filterExtractedTerms(Predicate fieldMatcher, protected Set getFlags(String field) { Set highlightFlags = EnumSet.noneOf(HighlightFlag.class); - highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);//nocommit for experimentation, use as default if (shouldHandleMultiTermQuery(field)) { highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY); } if (shouldHighlightPhrasesStrictly(field)) { highlightFlags.add(HighlightFlag.PHRASES); } + if (highlightFlags.contains(HighlightFlag.PHRASES) && highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)) { + highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);//nocommit for experimentation, use as default + } if (shouldPreferPassageRelevancyOverSpeed(field)) { highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED); } @@ -791,7 +803,7 @@ protected PhraseHelper getPhraseHelper(String field, Query query, Set Date: Wed, 11 Jul 2018 17:59:47 -0400 Subject: [PATCH 08/14] LUCENE-8286 UH: Update some tests to work with Weight Matcher mode --- .../uhighlight/TestUnifiedHighlighter.java | 82 ++++++++++++++++--- .../TestUnifiedHighlighterStrictPhrases.java | 42 ++++++---- .../TestUnifiedHighlighterTermVec.java | 34 ++++++-- 3 files changed, 123 insertions(+), 35 deletions(-) diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index fa6f2867dbc3..d89499cd5e83 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -95,8 +95,12 @@ static UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analy return new UnifiedHighlighter(searcher, indexAnalyzer); } else { final UnifiedHighlighter uh = new UnifiedHighlighter(searcher, indexAnalyzer) { + Set flags; // consistently random set of flags for this test run @Override protected Set getFlags(String field) { + if (flags != null) { + return flags; + } final EnumSet result = EnumSet.copyOf(mandatoryFlags); int r = random().nextInt(); for (HighlightFlag highlightFlag : HighlightFlag.values()) { @@ -104,7 +108,12 @@ protected Set getFlags(String field) { result.add(highlightFlag); } } - return result; + if (result.contains(HighlightFlag.WEIGHT_MATCHES)) { + // these two are required for WEIGHT_MATCHES + result.add(HighlightFlag.MULTI_TERM_QUERY); + result.add(HighlightFlag.PHRASES); + } + return flags = result; } }; uh.setCacheFieldValCharsThreshold(random().nextInt(100)); @@ -420,7 +429,11 @@ public void testBuddhism() throws Exception { highlighter.setHighlightPhrasesStrictly(false); String snippets[] = highlighter.highlight("body", query, topDocs, 2); assertEquals(1, snippets.length); - assertTrue(snippets[0].contains("Buddhist origins")); + if (highlighter.getFlags("body").containsAll(EnumSet.of(HighlightFlag.WEIGHT_MATCHES, HighlightFlag.PHRASES))) { + assertTrue(snippets[0], snippets[0].contains("Buddhist origins")); + } else { + assertTrue(snippets[0], snippets[0].contains("Buddhist origins")); + } ir.close(); } @@ -1187,6 +1200,19 @@ protected Predicate getFieldMatcher(String field) { ir.close(); } + public void testMatchesSlopBug() throws IOException { + IndexReader ir = indexSomeFields(); + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); + Query query = new PhraseQuery(2, "title", "this", "is", "the", "field"); + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighter.highlight("title", query, topDocs, 10); + assertEquals(1, snippets.length); + assertEquals("This is the title field.", snippets[0]); + ir.close(); + } + public void testFieldMatcherPhraseQuery() throws Exception { IndexReader ir = indexSomeFields(); IndexSearcher searcher = newSearcher(ir); @@ -1197,7 +1223,7 @@ protected Predicate getFieldMatcher(String field) { return (qf) -> true; } }; - UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.PHRASES)); + UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.PHRASES, HighlightFlag.MULTI_TERM_QUERY)); highlighterFieldMatch.setFieldMatcher(null);//default BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder() @@ -1214,16 +1240,28 @@ protected Predicate getFieldMatcher(String field) { assertEquals(1, topDocs.totalHits.value); String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the title field.", snippets[0]); + if (highlighterNoFieldMatch.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the title field.", snippets[0]); + } else { + assertEquals("This is the title field.", snippets[0]); + } snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the title field.", snippets[0]); + if (highlighterFieldMatch.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the title field.", snippets[0]); + } else { + assertEquals("This is the title field.", snippets[0]); + } highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq)); snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the title field.", snippets[0]); + if (highlighterFieldMatch.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the title field.", snippets[0]); + } else { + assertEquals("This is the title field.", snippets[0]); + } highlighterFieldMatch.setFieldMatcher(null); } @@ -1233,11 +1271,20 @@ protected Predicate getFieldMatcher(String field) { assertEquals(1, topDocs.totalHits.value); String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + if (highlighterNoFieldMatch.getFlags("text").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + } else { + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + } snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + if (highlighterFieldMatch.getFlags("text").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + } else { + //nocommit why does this highlight the first "text"? + assertEquals("This is the text field. You can put some text if you want.", snippets[0]); + } highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq)); snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10); @@ -1252,17 +1299,28 @@ protected Predicate getFieldMatcher(String field) { assertEquals(1, topDocs.totalHits.value); String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the category field.", snippets[0]); + if (highlighterNoFieldMatch.getFlags("category").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the category field.", snippets[0]); + } else { + assertEquals("This is the category field.", snippets[0]); + } snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the category field.", snippets[0]); - + if (highlighterFieldMatch.getFlags("category").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the category field.", snippets[0]); + } else { + assertEquals("This is the category field.", snippets[0]); + } highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq)); snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the category field.", snippets[0]); + if (highlighterFieldMatch.getFlags("category").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the category field.", snippets[0]); + } else { + assertEquals("This is the category field.", snippets[0]); + } highlighterFieldMatch.setFieldMatcher(null); } ir.close(); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java index 2fefaba26f6a..ee9b9d224a63 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java @@ -20,6 +20,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.Set; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.MockAnalyzer; @@ -52,6 +53,7 @@ import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag; import org.apache.lucene.store.Directory; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; @@ -138,7 +140,7 @@ public void testBasics() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"yin alone, Yin yang, yin gap yang"}, snippets); + assertArrayEquals(new String[]{"yin alone, Yin yang, yin gap yang"}, snippets); } public void testWithSameTermQuery() throws IOException { @@ -155,7 +157,7 @@ public void testWithSameTermQuery() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"Yin yang, yin gap yang"}, snippets); + assertArrayEquals(new String[]{"Yin yang, yin gap yang"}, snippets); // test the Passage only has 3 matches. We don't want duplicates from "Yin" being in TermQuery & PhraseQuery. highlighter.setFormatter(new PassageFormatter() { @@ -199,7 +201,7 @@ public void testSubPhrases() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); // test the Passage only has 3 matches. We don't want duplicates from both PhraseQuery highlighter.setFormatter(new PassageFormatter() { @@ -224,7 +226,7 @@ public void testSynonyms() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"mother father w mom father w dad"}, snippets); + assertArrayEquals(new String[]{"mother father w mom father w dad"}, snippets); } /** @@ -252,7 +254,7 @@ public void testRewriteAndMtq() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); // do again, this time with MTQ disabled. We should only find "alpha bravo". @@ -288,7 +290,7 @@ public void testRewrite() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); // do again, this time with MTQ disabled. We should only find "alpha bravo". @@ -325,7 +327,7 @@ public void testMtq() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); // do again, this time with MTQ disabled. @@ -334,6 +336,7 @@ public void testMtq() throws IOException { topDocs = searcher.search(query, 10, Sort.INDEXORDER); snippets = highlighter.highlight("body", query, topDocs); + //note: without MTQ, the WEIGHT_MATCHES is disabled which affects the snippet boundaries assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); } @@ -351,7 +354,7 @@ public void testMultiValued() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs, 2); - assertArrayEquals(new String[]{"one bravo three... four bravo six"}, + assertArrayEquals(new String[]{"one bravo three... four bravo six"}, snippets); @@ -381,7 +384,11 @@ public void testMultiValued() throws IOException { topDocs = searcher.search(query, 10); assertEquals(1, topDocs.totalHits.value); snippets = highlighter.highlight("body", query, topDocs, 2); - assertEquals("one bravo three... four bravo six", snippets[0]); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("one bravo three... four bravo six", snippets[0]); + } else { + assertEquals("one bravo three... four bravo six", snippets[0]); + } } public void testMaxLen() throws IOException { @@ -390,18 +397,18 @@ public void testMaxLen() throws IOException { highlighter.setMaxLength(21); BooleanQuery query = new BooleanQuery.Builder() - .add(newPhraseQuery("body", "alpha bravo"), BooleanClause.Occur.MUST) - .add(newPhraseQuery("body", "gap alpha"), BooleanClause.Occur.MUST) + .add(newPhraseQuery("body", "alpha bravo"), BooleanClause.Occur.SHOULD) + .add(newPhraseQuery("body", "gap alpha"), BooleanClause.Occur.SHOULD) .add(newPhraseQuery("body", "charlie gap"), BooleanClause.Occur.SHOULD) .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - if (fieldType == UHTestHelper.reanalysisType) { - assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); + if (fieldType == UHTestHelper.reanalysisType || highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); } else { - assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); + assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); } } @@ -436,6 +443,13 @@ public void testPreSpanQueryRewrite() throws IOException { initReaderSearcherHighlighter(); highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Set getFlags(String field) { + final Set flags = super.getFlags(field); + flags.remove(HighlightFlag.WEIGHT_MATCHES);//unsupported + return flags; + } + @Override protected Collection preSpanQueryRewrite(Query query) { if (query instanceof MyQuery) { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java index 2d9fb0e18df1..6a92546d8f00 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java @@ -16,13 +16,31 @@ */ package org.apache.lucene.search.uhighlight; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.*; +import org.apache.lucene.index.CheckIndex; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FilterDirectoryReader; +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.ParallelLeafReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; @@ -35,13 +53,6 @@ import org.junit.Before; import org.junit.Test; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; -import java.util.List; -import java.util.Map; - /** * Tests highlighting for matters *expressly* relating to term vectors. *

@@ -182,7 +193,12 @@ public void testUserFailedToIndexOffsets() throws IOException { iw.close(); IndexSearcher searcher = newSearcher(ir); - UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); + UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Set getFlags(String field) { + return Collections.emptySet();//no WEIGHT_MATCHES + } + }; TermQuery query = new TermQuery(new Term("body", "vectors")); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); try { From 6098a36a008b464e946323184af8452df2395186 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Mon, 13 Aug 2018 14:29:09 -0400 Subject: [PATCH 09/14] LUCENE-8286 UH: Use MI.getSubMatches(). Removed PhraseHelper changes; not necessary anymore. Updated based on MI improvements in master. With subMatches, we have better fidelity on span queries. And since MI can handle span queries now, no need to touch PhraseHelper. * added to UHComponents: query, and highlightFlags * updated tests to handle with/without WEIGHT_MATCHES * TestUnifiedHighlighterStrictPhrases uses more randomization. Removed brittle score calculation dependence. * Test Passage matches data is in order TODO: OE freq & term() --- .../uhighlight/FieldOffsetStrategy.java | 94 ++++++------- .../search/uhighlight/NoOpOffsetStrategy.java | 4 +- .../lucene/search/uhighlight/OffsetsEnum.java | 132 +++++++++++++++++- .../search/uhighlight/PhraseHelper.java | 35 +---- .../uhighlight/TokenStreamOffsetStrategy.java | 12 +- .../search/uhighlight/UHComponents.java | 26 +++- .../search/uhighlight/UnifiedHighlighter.java | 37 ++--- .../uhighlight/TestUnifiedHighlighter.java | 67 ++++----- .../uhighlight/TestUnifiedHighlighterMTQ.java | 2 +- .../TestUnifiedHighlighterStrictPhrases.java | 128 +++++++++++------ .../TestUnifiedHighlighterExtensibility.java | 17 ++- 11 files changed, 367 insertions(+), 187 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index 7898ce148029..daf7ef4f4e40 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -51,7 +51,7 @@ public abstract class FieldOffsetStrategy { public FieldOffsetStrategy(UHComponents components) { this.components = components; this.field = components.getField(); - this.terms = components.getExtractedTerms(); + this.terms = components.getTerms(); this.phraseHelper = components.getPhraseHelper(); this.automata = components.getAutomata(); } @@ -75,57 +75,40 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc return OffsetsEnum.EMPTY; } - final List offsetsEnums = new ArrayList<>(terms.length + automata.length); - - final boolean useWeightMatchesIter = phraseHelper.useWeightMatchesIter(); - - // Handle position insensitive terms (a subset of this.terms field): - final BytesRef[] insensitiveTerms; - if (useWeightMatchesIter) { - insensitiveTerms = new BytesRef[0]; // don't -- WEIGHT_MATCHES will handle these - } else if (phraseHelper.hasPositionSensitivity()) { - insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms(); - assert insensitiveTerms.length <= terms.length : "insensitive terms should be smaller set of all terms"; - } else { - insensitiveTerms = terms; - } - if (insensitiveTerms.length > 0) { - createOffsetsEnumsForTerms(insensitiveTerms, termsIndex, doc, offsetsEnums); - } + final List offsetsEnums = new ArrayList<>(); // Handle Weight.matches approach - if (useWeightMatchesIter) { + if (components.getHighlightFlags().contains(UnifiedHighlighter.HighlightFlag.WEIGHT_MATCHES)) { + createOffsetsEnumsWeightMatcher(leafReader, doc, offsetsEnums); - } - // Handle spans - if (phraseHelper.hasPositionSensitivity()) { - phraseHelper.createOffsetsEnumsForSpans(leafReader, doc, offsetsEnums); - } + } else { // classic approach - // Handle automata - if (automata.length > 0 && useWeightMatchesIter==false) {// WEIGHT_MATCHES handled these already - createOffsetsEnumsForAutomata(termsIndex, doc, offsetsEnums); - } + // Handle position insensitive terms (a subset of this.terms field): + final BytesRef[] insensitiveTerms; + if (phraseHelper.hasPositionSensitivity()) { + insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms(); + assert insensitiveTerms.length <= terms.length : "insensitive terms should be smaller set of all terms"; + } else { + insensitiveTerms = terms; + } + if (insensitiveTerms.length > 0) { + createOffsetsEnumsForTerms(insensitiveTerms, termsIndex, doc, offsetsEnums); + } - //TODO if only one OE then return it; don't wrap in Multi. If none, return NONE - return new OffsetsEnum.MultiOffsetsEnum(offsetsEnums); - } + // Handle spans + if (phraseHelper.hasPositionSensitivity()) { + phraseHelper.createOffsetsEnumsForSpans(leafReader, doc, offsetsEnums); + } - protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsIndex, int doc, List results) throws IOException { - TermsEnum termsEnum = termsIndex.iterator();//does not return null - for (BytesRef term : sourceTerms) { - if (termsEnum.seekExact(term)) { - PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); - if (postingsEnum == null) { - // no offsets or positions available - throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); - } - if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted - results.add(new OffsetsEnum.OfPostings(term, postingsEnum)); - } + // Handle automata + if (automata.length > 0) { + createOffsetsEnumsForAutomata(termsIndex, doc, offsetsEnums); } } + + //TODO if only one OE then return it; don't wrap in Multi. If none, return NONE + return new OffsetsEnum.MultiOffsetsEnum(offsetsEnums); } protected void createOffsetsEnumsWeightMatcher(LeafReader _leafReader, int docId, List results) throws IOException { @@ -140,7 +123,8 @@ public Terms terms(String field) throws IOException { } } - //these ought to be a default? So many subclasses do this! + // So many subclasses do this! + //these ought to be a default or added via some intermediary like "FilterTransientLeafReader" (exception on close). @Override public CacheHelper getCoreCacheHelper() { return null; @@ -153,25 +137,41 @@ public CacheHelper getReaderCacheHelper() { }; IndexSearcher indexSearcher = new IndexSearcher(leafReader); indexSearcher.setQueryCache(null); - Matches matches = indexSearcher.rewrite(phraseHelper.getQuery()) + Matches matches = indexSearcher.rewrite(components.getQuery()) .createWeight(indexSearcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f) .matches(leafReader.getContext(), docId); if (matches == null) { return; // doc doesn't match } for (String field : matches) { - if (phraseHelper.getFieldMatcher().test(field)) { + if (components.getFieldMatcher().test(field)) { MatchesIterator iterator = matches.getMatches(field); if (iterator == null) { continue; } //note: the iterator should be positioned on the first one already - results.add(new OffsetsEnum.OfMatchesIterator(iterator)); + results.add(new OffsetsEnum.OfMatchesIteratorWithSubs(iterator)); } } } + protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsIndex, int doc, List results) throws IOException { + TermsEnum termsEnum = termsIndex.iterator();//does not return null + for (BytesRef term : sourceTerms) { + if (termsEnum.seekExact(term)) { + PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); + if (postingsEnum == null) { + // no offsets or positions available + throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); + } + if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted + results.add(new OffsetsEnum.OfPostings(term, postingsEnum)); + } + } + } + } + protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List results) throws IOException { List> automataPostings = new ArrayList<>(automata.length); for (int i = 0; i < automata.length; i++) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java index e42ff2353f1e..80528ce56123 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/NoOpOffsetStrategy.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; +import java.util.Collections; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -32,7 +34,7 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy { public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy(); private NoOpOffsetStrategy() { - super(new UHComponents("_ignored_", (s) -> false, new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0])); + super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], Collections.emptySet())); } @Override diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java index 906117247c2f..14f8875269bc 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java @@ -94,8 +94,14 @@ public void close() throws IOException { @Override public String toString() { final String name = getClass().getSimpleName(); + String offset = ""; try { - return name + "(term:" + getTerm().utf8ToString() +")"; + offset = ",[" + startOffset() + "-" + endOffset() + "]"; + } catch (Exception e) { + //ignore; for debugging only + } + try { + return name + "(term:" + getTerm().utf8ToString() + offset + ")"; } catch (Exception e) { return name; } @@ -158,6 +164,130 @@ public int freq() throws IOException { } } + /** Based on a {@link MatchesIterator} with submatches. */ + public static class OfMatchesIteratorWithSubs extends OffsetsEnum { + //Either CachedOE impls (which are the submatches) or OfMatchesIterator impls + private final PriorityQueue pendingQueue = new PriorityQueue<>(); + + public OfMatchesIteratorWithSubs(MatchesIterator matchesIterator) { + pendingQueue.add(new OfMatchesIterator(matchesIterator)); + } + + @Override + public boolean nextPosition() throws IOException { + OffsetsEnum formerHeadOE = pendingQueue.poll(); // removes the head + if (formerHeadOE instanceof CachedOE) { + // we're done with the former head. CachedOE's are one use only. + // Look at the new head... + OffsetsEnum newHeadOE = pendingQueue.peek(); + if (newHeadOE instanceof OfMatchesIterator) { + // We found the matchesIterator. Requires processing. + nextWhenMatchesIterator((OfMatchesIterator) newHeadOE); // May or may not remove or re-queue itself + } // else new head is a CacheOE or no more. Nothing to do with it. + + } else { // formerHeadOE is OfMatchesIterator; advance it + OfMatchesIterator miOE = (OfMatchesIterator) formerHeadOE; + if (miOE.nextPosition()) { + nextWhenMatchesIterator(miOE); // requires processing. May or may not re-enqueue itself + } + } + return pendingQueue.isEmpty() == false; + } + + private void nextWhenMatchesIterator(OfMatchesIterator miOE) throws IOException { + boolean isHead = miOE == pendingQueue.peek(); + MatchesIterator subMatches = miOE.matchesIterator.getSubMatches(); + if (subMatches != null) { + // remove this miOE from the queue, add it's submatches, next() it, then re-enqueue it + if (isHead) { + pendingQueue.poll(); // remove + } + + enqueueCachedMatches(subMatches); + + if (miOE.nextPosition()) { + pendingQueue.add(miOE); + assert pendingQueue.peek() != miOE; // miOE should follow cached entries + } + + } else { // else has no subMatches. It will stay enqueued. + if (!isHead) { + pendingQueue.add(miOE); + } // else it's *already* in pendingQueue + } + } + + private boolean enqueueCachedMatches(MatchesIterator thisMI) throws IOException { + if (thisMI == null) { + return false; + } else { + while (thisMI.next()) { + if (false == enqueueCachedMatches(thisMI.getSubMatches())) { // recursion + // if no sub-matches then add ourselves + pendingQueue.add(new CachedOE(thisMI.startOffset(), thisMI.endOffset())); + } + } + return true; + } + } + + @Override + public int freq() throws IOException { + return pendingQueue.peek().freq(); + } + + @Override + public BytesRef getTerm() throws IOException { + return pendingQueue.peek().getTerm(); + } + + @Override + public int startOffset() throws IOException { + return pendingQueue.peek().startOffset(); + } + + @Override + public int endOffset() throws IOException { + return pendingQueue.peek().endOffset(); + } + + private static class CachedOE extends OffsetsEnum { + final int startOffset; + final int endOffset; + + private CachedOE(int startOffset, int endOffset) { + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + @Override + public boolean nextPosition() throws IOException { + return false; + } + + @Override + public int freq() throws IOException { + return 1; // nocommit terrible + } + + @Override + public BytesRef getTerm() throws IOException { + return new BytesRef(); // nocommit terrible + } + + @Override + public int startOffset() throws IOException { + return startOffset; + } + + @Override + public int endOffset() throws IOException { + return endOffset; + } + } + } + + /** Based on a {@link MatchesIterator}; does not look at submatches. */ public static class OfMatchesIterator extends OffsetsEnum { private final MatchesIterator matchesIterator; diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java index dd00b3d7d008..ab439d09a2bc 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java @@ -38,9 +38,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; @@ -67,15 +65,13 @@ public class PhraseHelper { public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_", - (s) -> false, spanQuery -> null, query -> null, true, false); + (s) -> false, spanQuery -> null, query -> null, true); - private final Query query; private final String fieldName; private final Set positionInsensitiveTerms; // (TermQuery terms) private final Set spanQueries; private final boolean willRewrite; private final Predicate fieldMatcher; - private final boolean useWeightMatchesIter; /** * Constructor. @@ -90,18 +86,13 @@ public class PhraseHelper { */ public PhraseHelper(Query query, String field, Predicate fieldMatcher, Function rewriteQueryPred, Function> preExtractRewriteFunction, - boolean ignoreQueriesNeedingRewrite, boolean useWeightMatchesIter) { - this.query = query; + boolean ignoreQueriesNeedingRewrite) { this.fieldName = field; this.fieldMatcher = fieldMatcher; - this.useWeightMatchesIter = useWeightMatchesIter; // filter terms to those we want positionInsensitiveTerms = new HashSet<>(); spanQueries = new HashSet<>(); - // if useWeightMatchesIter: - // TODO once SpanQueries implement Weight.matches, we needn't capture them here. But note SpanMultiTermQueryWrapper - // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls boolean[] mustRewriteHolder = {false}; // boolean wrapped in 1-ary array so it's mutable from inner class @@ -141,8 +132,6 @@ protected void extract(Query query, float boost, Map t for (Query newQuery : newQueriesToExtract) { extract(newQuery, boost, terms); } - } else if (useWeightMatchesIter && (query instanceof PhraseQuery || query instanceof MultiPhraseQuery)) { - return;// do nothing; these queries have Weight impls with proper getMulti } else { super.extract(query, boost, terms); } @@ -160,10 +149,8 @@ protected boolean isQueryUnsupported(Class clazz) { @Override protected void extractWeightedTerms(Map terms, Query query, float boost) throws IOException { - if (!useWeightMatchesIter) { // when useWeightMatchesIterator, we needn't track these terms - query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, ScoreMode.COMPLETE_NO_SCORES, boost) - .extractTerms(extractPosInsensitiveTermsTarget); - } + query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, ScoreMode.COMPLETE_NO_SCORES, boost) + .extractTerms(extractPosInsensitiveTermsTarget); } // called on SpanQueries. Some other position-sensitive queries like PhraseQuery are converted beforehand @@ -198,18 +185,6 @@ protected boolean mustRewriteQuery(SpanQuery spanQuery) { willRewrite = mustRewriteHolder[0]; } - public Query getQuery() { - return query; - } - - public boolean useWeightMatchesIter() { - return useWeightMatchesIter; - } - - public Predicate getFieldMatcher() { - return fieldMatcher; - } - public Set getSpanQueries() { return spanQueries; } @@ -230,7 +205,7 @@ public boolean willRewrite() { return willRewrite; } - /** Returns the terms that are position-insensitive (sorted). Always empty if {@link #useWeightMatchesIter}. */ + /** Returns the terms that are position-insensitive (sorted). */ public BytesRef[] getAllPositionInsensitiveTerms() { BytesRef[] result = positionInsensitiveTerms.toArray(new BytesRef[positionInsensitiveTerms.size()]); Arrays.sort(result); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java index e851ca660cdd..e829769909ef 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java @@ -38,11 +38,13 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) { super(new UHComponents( - components.getField(), - components.getFieldMatcher(), - ZERO_LEN_BYTES_REF_ARRAY, - components.getPhraseHelper(), - convertTermsToAutomata(components.getExtractedTerms(), components.getAutomata())), + components.getField(), + components.getFieldMatcher(), + components.getQuery(), + ZERO_LEN_BYTES_REF_ARRAY, + components.getPhraseHelper(), + convertTermsToAutomata(components.getTerms(), components.getAutomata()), + components.getHighlightFlags()), indexAnalyzer); assert phraseHelper.hasPositionSensitivity() == false; } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java index a8ea1c0273cf..9719fab27485 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.uhighlight; +import java.util.Set; import java.util.function.Predicate; +import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -30,16 +32,22 @@ public class UHComponents { private final String field; private final Predicate fieldMatcher; - private final BytesRef[] extractedTerms; + private final Query query; + private final BytesRef[] terms; private final PhraseHelper phraseHelper; private final CharacterRunAutomaton[] automata; + private final Set highlightFlags; - public UHComponents(String field, Predicate fieldMatcher, BytesRef[] extractedTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) { + public UHComponents(String field, Predicate fieldMatcher, Query query, + BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, + Set highlightFlags) { this.field = field; this.fieldMatcher = fieldMatcher; - this.extractedTerms = extractedTerms; + this.query = query; + this.terms = terms; this.phraseHelper = phraseHelper; this.automata = automata; + this.highlightFlags = highlightFlags; } public String getField() { @@ -50,8 +58,12 @@ public Predicate getFieldMatcher() { return fieldMatcher; } - public BytesRef[] getExtractedTerms() { - return extractedTerms; + public Query getQuery() { + return query; + } + + public BytesRef[] getTerms() { + return terms; } public PhraseHelper getPhraseHelper() { @@ -61,4 +73,8 @@ public PhraseHelper getPhraseHelper() { public CharacterRunAutomaton[] getAutomata() { return automata; } + + public Set getHighlightFlags() { + return highlightFlags; + } } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index d55ec3fefefa..4e4d81528733 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -752,13 +752,15 @@ public Object highlightWithoutSearcher(String field, Query query, String content } protected FieldHighlighter getFieldHighlighter(String field, Query query, Set allTerms, int maxPassages) { - BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms); + Predicate fieldMatcher = getFieldMatcher(field); + BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms); Set highlightFlags = getFlags(field); PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags); OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata); + UHComponents components = new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags); return new FieldHighlighter(field, - getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags), + getOffsetStrategy(offsetSource, components), new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR), getScorer(field), maxPassages, @@ -785,9 +787,6 @@ protected Set getFlags(String field) { if (shouldHighlightPhrasesStrictly(field)) { highlightFlags.add(HighlightFlag.PHRASES); } - if (highlightFlags.contains(HighlightFlag.PHRASES) && highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)) { - highlightFlags.add(HighlightFlag.WEIGHT_MATCHES);//nocommit for experimentation, use as default - } if (shouldPreferPassageRelevancyOverSpeed(field)) { highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED); } @@ -795,21 +794,30 @@ protected Set getFlags(String field) { } protected PhraseHelper getPhraseHelper(String field, Query query, Set highlightFlags) { + boolean useWeightMatchesIter = highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES); + if (useWeightMatchesIter) { + return PhraseHelper.NONE; // will be handled by Weight.matches which always considers phrases + } boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES); boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY); - boolean useWeightMatchesIter = highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES); return highlightPhrasesStrictly ? new PhraseHelper(query, field, getFieldMatcher(field), this::requiresRewrite, this::preSpanQueryRewrite, - !handleMultiTermQuery, - useWeightMatchesIter && handleMultiTermQuery) + !handleMultiTermQuery + ) : PhraseHelper.NONE; } protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set highlightFlags) { + // do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those? + // if don't highlight phrases strictly, + final boolean lookInSpan = + !highlightFlags.contains(HighlightFlag.PHRASES) // no PhraseHelper + || highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES); // Weight.Matches will find all + return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY) - ? MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), !highlightFlags.contains(HighlightFlag.PHRASES), this::preMultiTermQueryRewrite) + ? MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), lookInSpan, this::preMultiTermQueryRewrite) : ZERO_LEN_AUTOMATA_ARRAY; } @@ -847,15 +855,12 @@ protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, return offsetSource; } - protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, - PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, - Set highlightFlags) { - final UHComponents components = new UHComponents(field, getFieldMatcher(field), terms, phraseHelper, automata); + protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, UHComponents components) { switch (offsetSource) { case ANALYSIS: - if (!phraseHelper.hasPositionSensitivity() && - !highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED) && - !highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES)) { + if (!components.getPhraseHelper().hasPositionSensitivity() && + !components.getHighlightFlags().contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED) && + !components.getHighlightFlags().contains(HighlightFlag.WEIGHT_MATCHES)) { //skip using a memory index since it's pure term filtering return new TokenStreamOffsetStrategy(components, getIndexAnalyzer()); } else { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index d89499cd5e83..c9435a04471d 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -86,42 +86,39 @@ public void doAfter() throws IOException { } static UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer) { - return randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.noneOf(HighlightFlag.class)); + return randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.noneOf(HighlightFlag.class), null); } static UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer, - EnumSet mandatoryFlags) { - if (random().nextBoolean()) { - return new UnifiedHighlighter(searcher, indexAnalyzer); - } else { - final UnifiedHighlighter uh = new UnifiedHighlighter(searcher, indexAnalyzer) { - Set flags; // consistently random set of flags for this test run - @Override - protected Set getFlags(String field) { - if (flags != null) { - return flags; - } - final EnumSet result = EnumSet.copyOf(mandatoryFlags); - int r = random().nextInt(); - for (HighlightFlag highlightFlag : HighlightFlag.values()) { - if (((1 << highlightFlag.ordinal()) & r) == 0) { - result.add(highlightFlag); - } - } - if (result.contains(HighlightFlag.WEIGHT_MATCHES)) { - // these two are required for WEIGHT_MATCHES - result.add(HighlightFlag.MULTI_TERM_QUERY); - result.add(HighlightFlag.PHRASES); + EnumSet mandatoryFlags, Boolean requireFieldMatch) { + final UnifiedHighlighter uh = new UnifiedHighlighter(searcher, indexAnalyzer) { + Set flags; // consistently random set of flags for this test run + @Override + protected Set getFlags(String field) { + if (flags != null) { + return flags; + } + final EnumSet result = EnumSet.copyOf(mandatoryFlags); + int r = random().nextInt(); + for (HighlightFlag highlightFlag : HighlightFlag.values()) { + if (((1 << highlightFlag.ordinal()) & r) == 0) { + result.add(highlightFlag); } - return flags = result; } - }; - uh.setCacheFieldValCharsThreshold(random().nextInt(100)); - if (random().nextBoolean()) { - uh.setFieldMatcher(f -> true); // requireFieldMatch==false + if (result.contains(HighlightFlag.WEIGHT_MATCHES)) { + // these two are required for WEIGHT_MATCHES + result.add(HighlightFlag.MULTI_TERM_QUERY); + result.add(HighlightFlag.PHRASES); + } + return flags = result; } - return uh; + }; + uh.setCacheFieldValCharsThreshold(random().nextInt(100)); + if (requireFieldMatch == Boolean.FALSE || (requireFieldMatch == null && random().nextBoolean())) { + uh.setFieldMatcher(f -> true); // requireFieldMatch==false } + return uh; + } // @@ -1126,7 +1123,7 @@ protected Predicate getFieldMatcher(String field) { return (qf) -> true; } }; - UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.MULTI_TERM_QUERY)); + UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.MULTI_TERM_QUERY), null); highlighterFieldMatch.setFieldMatcher(null);//default BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder() @@ -1206,10 +1203,14 @@ public void testMatchesSlopBug() throws IOException { UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); Query query = new PhraseQuery(2, "title", "this", "is", "the", "field"); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertEquals(1, topDocs.totalHits); + assertEquals(1, topDocs.totalHits.value); String[] snippets = highlighter.highlight("title", query, topDocs, 10); assertEquals(1, snippets.length); - assertEquals("This is the title field.", snippets[0]); + if (highlighter.getFlags("title").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertEquals("This is the title field.", snippets[0]); + } else { + assertEquals("This is the title field.", snippets[0]); + } ir.close(); } @@ -1223,7 +1224,7 @@ protected Predicate getFieldMatcher(String field) { return (qf) -> true; } }; - UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.PHRASES, HighlightFlag.MULTI_TERM_QUERY)); + UnifiedHighlighter highlighterFieldMatch = randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.of(HighlightFlag.PHRASES, HighlightFlag.MULTI_TERM_QUERY), null); highlighterFieldMatch.setFieldMatcher(null);//default BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder() diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java index 2f97ad4eeb05..44d6f7b0d686 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java @@ -154,7 +154,7 @@ public void testWildcards() throws Exception { private UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer) { return TestUnifiedHighlighter.randomUnifiedHighlighter(searcher, indexAnalyzer, - EnumSet.of(HighlightFlag.MULTI_TERM_QUERY)); + EnumSet.of(HighlightFlag.MULTI_TERM_QUERY), null); } public void testOnePrefix() throws Exception { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java index ee9b9d224a63..a6d5d01566ed 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java @@ -17,10 +17,12 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; -import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; +import java.util.Locale; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.MockAnalyzer; @@ -73,6 +75,9 @@ public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase { UnifiedHighlighter highlighter; IndexReader indexReader; + // Is it okay if a match (identified by offset pair) appears multiple times in the passage? + AtomicBoolean dupMatchAllowed = new AtomicBoolean(true); + @ParametersFactory public static Iterable parameters() { return UHTestHelper.parametersFactoryList(); @@ -106,8 +111,32 @@ private Document newDoc(String... bodyVals) { private void initReaderSearcherHighlighter() throws IOException { indexReader = indexWriter.getReader(); searcher = newSearcher(indexReader); - highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); - highlighter.setHighlightPhrasesStrictly(true); + highlighter = TestUnifiedHighlighter.randomUnifiedHighlighter(searcher, indexAnalyzer, + EnumSet.of(HighlightFlag.PHRASES, HighlightFlag.MULTI_TERM_QUERY), true); + // intercept the formatter in order to check constraints on the passage. + final PassageFormatter defaultFormatter = highlighter.getFormatter(null); + highlighter.setFormatter(new PassageFormatter() { + @Override + public Object format(Passage[] passages, String content) { + boolean thisDupMatchAllowed = dupMatchAllowed.getAndSet(true); + for (Passage passage : passages) { + String prevPair = ""; + for (int i = 0; i < passage.getNumMatches(); i++) { + // pad each to make comparable + String pair = String.format(Locale.ROOT, "%03d-%03d", passage.getMatchStarts()[i], passage.getMatchEnds()[i]); + int cmp = prevPair.compareTo(pair); + if (cmp == 0) { + assertTrue("dup match in passage at offset " + pair, thisDupMatchAllowed); + } else if (cmp > 0) { + fail("bad match order in passage at offset " + pair); + } + prevPair = pair; + } + + } + return defaultFormatter.format(passages, content); + } + }); } private PhraseQuery newPhraseQuery(String field, String phrase) { @@ -139,8 +168,11 @@ public void testBasics() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - - assertArrayEquals(new String[]{"yin alone, Yin yang, yin gap yang"}, snippets); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"yin alone, Yin yang, yin gap yang"}, snippets); + } else { + assertArrayEquals(new String[]{"yin alone, Yin yang, yin gap yang"}, snippets); + } } public void testWithSameTermQuery() throws IOException { @@ -155,19 +187,13 @@ public void testWithSameTermQuery() throws IOException { .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + dupMatchAllowed.set(false); // We don't want duplicates from "Yin" being in TermQuery & PhraseQuery. String[] snippets = highlighter.highlight("body", query, topDocs); - - assertArrayEquals(new String[]{"Yin yang, yin gap yang"}, snippets); - - // test the Passage only has 3 matches. We don't want duplicates from "Yin" being in TermQuery & PhraseQuery. - highlighter.setFormatter(new PassageFormatter() { - @Override - public Object format(Passage[] passages, String content) { - return Arrays.toString(passages); - } - }); - assertArrayEquals(new String[]{"[Passage[0-22]{yin[0-3],yang[4-8],yin[10-13]}score=2.4964213]"}, - highlighter.highlight("body", query, topDocs)); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"Yin yang, yin gap yang"}, snippets); + } else { + assertArrayEquals(new String[]{"Yin yang, yin gap yang"}, snippets); + } } public void testPhraseNotInDoc() throws IOException { @@ -199,19 +225,14 @@ public void testSubPhrases() throws IOException { .build(); TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + dupMatchAllowed.set(false); // We don't want duplicates from both PhraseQuery String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); - - // test the Passage only has 3 matches. We don't want duplicates from both PhraseQuery - highlighter.setFormatter(new PassageFormatter() { - @Override - public Object format(Passage[] passages, String content) { - return Arrays.toString(passages); - } - }); - assertArrayEquals(new String[]{"[Passage[0-41]{alpha[0-5],bravo[6-11],charlie[12-19]}score=3.931102]"}, - highlighter.highlight("body", query, topDocs)); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } else { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } } public void testSynonyms() throws IOException { @@ -225,8 +246,11 @@ public void testSynonyms() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - - assertArrayEquals(new String[]{"mother father w mom father w dad"}, snippets); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"mother father w mom father w dad"}, snippets); + } else { + assertArrayEquals(new String[]{"mother father w mom father w dad"}, snippets); + } } /** @@ -254,10 +278,14 @@ public void testRewriteAndMtq() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, - snippets); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } else { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } // do again, this time with MTQ disabled. We should only find "alpha bravo". + highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); highlighter.setHandleMultiTermQuery(false);//disable but leave phrase processing enabled topDocs = searcher.search(query, 10, Sort.INDEXORDER); @@ -290,10 +318,14 @@ public void testRewrite() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, - snippets); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } else { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } // do again, this time with MTQ disabled. We should only find "alpha bravo". + highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); highlighter.setHandleMultiTermQuery(false);//disable but leave phrase processing enabled topDocs = searcher.search(query, 10, Sort.INDEXORDER); @@ -327,10 +359,14 @@ public void testMtq() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, - snippets); + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } else { + assertArrayEquals(new String[]{"alpha bravo charlie - charlie bravo alpha"}, snippets); + } // do again, this time with MTQ disabled. + highlighter = new UnifiedHighlighter(searcher, indexAnalyzer); highlighter.setHandleMultiTermQuery(false);//disable but leave phrase processing enabled topDocs = searcher.search(query, 10, Sort.INDEXORDER); @@ -354,9 +390,11 @@ public void testMultiValued() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs, 2); - assertArrayEquals(new String[]{"one bravo three... four bravo six"}, - snippets); - + if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { + assertArrayEquals(new String[]{"one bravo three... four bravo six"}, snippets); + } else { + assertArrayEquals(new String[]{"one bravo three... four bravo six"}, snippets); + } // now test phraseQuery won't span across values assert indexAnalyzer.getPositionIncrementGap("body") > 0; @@ -405,10 +443,15 @@ public void testMaxLen() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); - if (fieldType == UHTestHelper.reanalysisType || highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) { - assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); + final boolean weightMatches = highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES); + if (fieldType == UHTestHelper.reanalysisType || weightMatches) { + if (weightMatches) { + assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); + } else { + assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); + } } else { - assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); + assertArrayEquals(new String[]{"alpha bravo charlie -"}, snippets); } } @@ -427,6 +470,7 @@ public void testFilteredOutSpan() throws IOException { TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); String[] snippets = highlighter.highlight("body", query, topDocs); + // spans' MatchesIterator exposes each underlying term; thus doesn't enclose intermediate "of" assertArrayEquals(new String[]{"freezing cold stuff like stuff freedom of speech"}, snippets); } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java index ca21000137dc..52a790cbed5a 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java @@ -19,9 +19,11 @@ import java.io.IOException; import java.text.BreakIterator; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Predicate; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -29,6 +31,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.uhighlight.FieldHighlighter; @@ -60,9 +63,9 @@ public void testFieldOffsetStrategyExtensibility() { final UnifiedHighlighter.OffsetSource offsetSource = UnifiedHighlighter.OffsetSource.NONE_NEEDED; FieldOffsetStrategy strategy = new FieldOffsetStrategy(new UHComponents("field", (s) -> false, - new BytesRef[0], + new MatchAllDocsQuery(), new BytesRef[0], PhraseHelper.NONE, - new CharacterRunAutomaton[0])) { + new CharacterRunAutomaton[0], Collections.emptySet())) { @Override public UnifiedHighlighter.OffsetSource getOffsetSource() { return offsetSource; @@ -149,13 +152,15 @@ protected List loadFieldValues(String[] fields, DocIdSetIterator @Override protected FieldHighlighter getFieldHighlighter(String field, Query query, Set allTerms, int maxPassages) { // THIS IS A COPY of the superclass impl; but use CustomFieldHighlighter - BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms); + Predicate fieldMatcher = getFieldMatcher(field); + BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms); Set highlightFlags = getFlags(field); PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags); OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata); + UHComponents components = new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags); return new CustomFieldHighlighter(field, - getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags), + getOffsetStrategy(offsetSource, components), new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR), getScorer(field), maxPassages, @@ -164,8 +169,8 @@ protected FieldHighlighter getFieldHighlighter(String field, Query query, Set highlightFlags) { - return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags); + protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, UHComponents components) { + return super.getOffsetStrategy(offsetSource, components); } @Override From 39a5c6243b09db36078cfd0772f44b875bf885aa Mon Sep 17 00:00:00 2001 From: David Smiley Date: Sat, 18 Aug 2018 23:18:05 -0400 Subject: [PATCH 10/14] LUCENE-8286 UH: Add highlight test showing we fix LUCENE-7903 --- .../uhighlight/TestUnifiedHighlighter.java | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index c9435a04471d..301788b7700f 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -1326,4 +1326,34 @@ protected Predicate getFieldMatcher(String field) { } ir.close(); } + + // LUCENE-7909 + public void testNestedBooleanQueryAccuracy() throws IOException { + IndexReader ir = indexSomeFields(); + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer, + EnumSet.of(HighlightFlag.WEIGHT_MATCHES), true); + + // This query contains an inner Boolean of two MUST clauses (== "AND"). Only one of them is + // actually in the data, the other is not. We should highlight neither. We can highlight the outer + // SHOULD clauses though. + Query query = new BooleanQuery.Builder() + .add(new PhraseQuery("title", "title", "field"), BooleanClause.Occur.SHOULD) + .add(new BooleanQuery.Builder() + .add(new TermQuery(new Term("category", "category")), BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("category", "nonexistent")), BooleanClause.Occur.MUST) + .build(), BooleanClause.Occur.SHOULD) + .build(); + + TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + + String[] snippets = highlighter.highlight("title", query, topDocs); + assertArrayEquals(new String[]{"This is the title field."}, snippets); + + // no highlights, not of "category" since "nonexistent" wasn't there + snippets = highlighter.highlight("category", query, topDocs); + assertArrayEquals(new String[]{"This is the category field."}, snippets); + + ir.close(); + } } From bbeeb44716ed3ec46683c6d5f038ea5003ac6dee Mon Sep 17 00:00:00 2001 From: David Smiley Date: Sun, 26 Aug 2018 10:00:05 -0400 Subject: [PATCH 11/14] LUCENE-8286: UH Implement OffsetsEnum.getTerm for MatchesIterator impls --- .../lucene/search/uhighlight/OffsetsEnum.java | 65 ++++++++++++++++--- .../lucene/search/uhighlight/Passage.java | 11 +++- .../TestUnifiedHighlighterRanking.java | 20 ++++++ 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java index 14f8875269bc..b73087e47a73 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java @@ -19,13 +19,20 @@ import java.io.Closeable; import java.io.IOException; +import java.util.HashMap; import java.util.List; import java.util.Objects; import java.util.PriorityQueue; +import java.util.TreeSet; +import java.util.function.Supplier; import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Term; import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IOUtils; /** @@ -78,8 +85,10 @@ public int compareTo(OffsetsEnum other) { public abstract int freq() throws IOException; /** - * The term at this position; usually always the same. + * The term at this position. * This BytesRef is safe to continue to refer to, even after we move to the next position. + * + * @see {@link Passage#getMatchTerms()}. */ public abstract BytesRef getTerm() throws IOException; @@ -168,9 +177,10 @@ public int freq() throws IOException { public static class OfMatchesIteratorWithSubs extends OffsetsEnum { //Either CachedOE impls (which are the submatches) or OfMatchesIterator impls private final PriorityQueue pendingQueue = new PriorityQueue<>(); + private final HashMap queryToTermMap = new HashMap<>(); public OfMatchesIteratorWithSubs(MatchesIterator matchesIterator) { - pendingQueue.add(new OfMatchesIterator(matchesIterator)); + pendingQueue.add(new OfMatchesIterator(matchesIterator, () -> queryToTerm(matchesIterator.getQuery()))); } @Override @@ -224,13 +234,46 @@ private boolean enqueueCachedMatches(MatchesIterator thisMI) throws IOException while (thisMI.next()) { if (false == enqueueCachedMatches(thisMI.getSubMatches())) { // recursion // if no sub-matches then add ourselves - pendingQueue.add(new CachedOE(thisMI.startOffset(), thisMI.endOffset())); + pendingQueue.add(new CachedOE(queryToTerm(thisMI.getQuery()), thisMI.startOffset(), thisMI.endOffset())); } } return true; } } + /** Maps a Query from {@link MatchesIterator#getQuery()} to {@link OffsetsEnum#getTerm()}. + * See {@link Passage#getMatchTerms()}. */ + private BytesRef queryToTerm(Query query) { + // compute an approximate BytesRef term of a Query. We cache this since we're likely to see the same query again. + // Our approach is to call extractTerms and visit each term in order, concatenating them with an adjoining space. + // If we don't have any (perhaps due to an MTQ like a wildcard) then we fall back on the toString() of the query. + return queryToTermMap.computeIfAbsent(query, (Query q) -> { + try { + BytesRefBuilder bytesRefBuilder = new BytesRefBuilder(); + UnifiedHighlighter.EMPTY_INDEXSEARCHER + .createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER.rewrite(q), ScoreMode.COMPLETE_NO_SCORES, 1f) + .extractTerms(new TreeSet() { + @Override + public boolean add(Term term) { + if (bytesRefBuilder.length() > 0) { + bytesRefBuilder.append((byte) ' '); + } + bytesRefBuilder.append(term.bytes()); + return true; + } + }); + if (bytesRefBuilder.length() > 0) { + return bytesRefBuilder.get(); + } + } catch (IOException e) {//ignore + // go to fallback... + } + + // fallback: (likely a MultiTermQuery) + return new BytesRef(q.toString()); + }); + } + @Override public int freq() throws IOException { return pendingQueue.peek().freq(); @@ -252,10 +295,12 @@ public int endOffset() throws IOException { } private static class CachedOE extends OffsetsEnum { + final BytesRef term; final int startOffset; final int endOffset; - private CachedOE(int startOffset, int endOffset) { + private CachedOE(BytesRef term, int startOffset, int endOffset) { + this.term = term; this.startOffset = startOffset; this.endOffset = endOffset; } @@ -267,12 +312,12 @@ public boolean nextPosition() throws IOException { @Override public int freq() throws IOException { - return 1; // nocommit terrible + return 1; // documented short-coming of MatchesIterator based UnifiedHighlighter } @Override public BytesRef getTerm() throws IOException { - return new BytesRef(); // nocommit terrible + return term; } @Override @@ -290,9 +335,11 @@ public int endOffset() throws IOException { /** Based on a {@link MatchesIterator}; does not look at submatches. */ public static class OfMatchesIterator extends OffsetsEnum { private final MatchesIterator matchesIterator; + private final Supplier termSupplier; - public OfMatchesIterator(MatchesIterator matchesIterator) { + public OfMatchesIterator(MatchesIterator matchesIterator, Supplier termSupplier) { this.matchesIterator = matchesIterator; + this.termSupplier = termSupplier; } @Override @@ -302,12 +349,12 @@ public boolean nextPosition() throws IOException { @Override public int freq() throws IOException { - return 1; // nocommit terrible + return 1; // documented short-coming of MatchesIterator based UnifiedHighlighter } @Override public BytesRef getTerm() throws IOException { - return new BytesRef(); // nocommit terrible + return termSupplier.get(); } @Override diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java index 570fcdf666f7..cbcde9e0bde1 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java @@ -157,7 +157,16 @@ public int[] getMatchEnds() { } /** - * BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}. + * BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}. The primary purpose of this + * method is to calculate the number of unique terms per passage for use in passage scoring. + * The actual term byte content is not well defined by this highlighter, and thus use of it is more subject to + * change. + *

+ * The term might be simply the analyzed term at this position. + * Depending on the highlighter's configuration, the match term may be a phrase (instead of a word), and in such + * a case might be a series of space-separated analyzed terms. + * If the match is from a {@link org.apache.lucene.search.MultiTermQuery} then the match term may be the toString() of + * that query. *

* Only {@link #getNumMatches()} are valid. */ diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java index ac7e9bc90f1a..2ddd2eb6dd30 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; +import java.util.EnumSet; import java.util.HashSet; import java.util.Random; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -274,6 +276,15 @@ public void testCustomB() throws Exception { IndexSearcher searcher = newSearcher(ir); UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Set getFlags(String field) { + if (random().nextBoolean()) { + return EnumSet.of(HighlightFlag.MULTI_TERM_QUERY, HighlightFlag.PHRASES, HighlightFlag.WEIGHT_MATCHES); + } else { + return super.getFlags(field); + } + } + @Override protected PassageScorer getScorer(String field) { return new PassageScorer(1.2f, 0, 87); @@ -314,6 +325,15 @@ public void testCustomK1() throws Exception { IndexSearcher searcher = newSearcher(ir); UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Set getFlags(String field) { + if (random().nextBoolean()) { + return EnumSet.of(HighlightFlag.MULTI_TERM_QUERY, HighlightFlag.PHRASES, HighlightFlag.WEIGHT_MATCHES); + } else { + return super.getFlags(field); + } + } + @Override protected PassageScorer getScorer(String field) { return new PassageScorer(0, 0.75f, 87); From 376ac79c9129fca8169e1b9e9f649221735b01a7 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Mon, 27 Aug 2018 00:59:02 -0400 Subject: [PATCH 12/14] LUCENE-8286: UH use one UHComponents field in FieldOffsetStrategy instead of its parts. Resolved nocommits. --- .../uhighlight/AnalysisOffsetStrategy.java | 10 ++++---- .../uhighlight/FieldOffsetStrategy.java | 20 ++++++---------- .../uhighlight/MemoryIndexOffsetStrategy.java | 23 ++++++++----------- .../lucene/search/uhighlight/OffsetsEnum.java | 2 +- ...PostingsWithTermVectorsOffsetStrategy.java | 4 ++-- .../uhighlight/TermVectorOffsetStrategy.java | 6 ++--- .../uhighlight/TokenStreamOffsetStrategy.java | 4 ++-- .../search/uhighlight/UHComponents.java | 6 ++--- .../uhighlight/TestUnifiedHighlighter.java | 3 ++- 9 files changed, 34 insertions(+), 44 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java index 6b0b60804862..63785e35f8a0 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java @@ -38,9 +38,9 @@ public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy { public AnalysisOffsetStrategy(UHComponents components, Analyzer analyzer) { super(components); this.analyzer = analyzer; - if (analyzer.getOffsetGap(field) != 1) { // note: 1 is the default. It is RARELY changed. + if (analyzer.getOffsetGap(getField()) != 1) { // note: 1 is the default. It is RARELY changed. throw new IllegalArgumentException( - "offset gap of the provided analyzer should be 1 (field " + field + ")"); + "offset gap of the provided analyzer should be 1 (field " + getField() + ")"); } } @@ -53,12 +53,12 @@ protected TokenStream tokenStream(String content) throws IOException { // If there is no splitChar in content then we needn't wrap: int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR); if (splitCharIdx == -1) { - return analyzer.tokenStream(field, content); + return analyzer.tokenStream(getField(), content); } - TokenStream subTokenStream = analyzer.tokenStream(field, content.substring(0, splitCharIdx)); + TokenStream subTokenStream = analyzer.tokenStream(getField(), content.substring(0, splitCharIdx)); - return new MultiValueTokenStream(subTokenStream, field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx); + return new MultiValueTokenStream(subTokenStream, getField(), analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx); } /** diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index daf7ef4f4e40..27d9318bae36 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -42,22 +42,13 @@ public abstract class FieldOffsetStrategy { protected final UHComponents components; - //nocommit remove these?: - protected final String field; - protected final PhraseHelper phraseHelper; // Query: position-sensitive information - protected final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive) - protected final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive public FieldOffsetStrategy(UHComponents components) { this.components = components; - this.field = components.getField(); - this.terms = components.getTerms(); - this.phraseHelper = components.getPhraseHelper(); - this.automata = components.getAutomata(); } public String getField() { - return field; + return components.getField(); } public abstract UnifiedHighlighter.OffsetSource getOffsetSource(); @@ -70,7 +61,7 @@ public String getField() { public abstract OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException; protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc) throws IOException { - final Terms termsIndex = leafReader.terms(field); + final Terms termsIndex = leafReader.terms(getField()); if (termsIndex == null) { return OffsetsEnum.EMPTY; } @@ -86,6 +77,8 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc // Handle position insensitive terms (a subset of this.terms field): final BytesRef[] insensitiveTerms; + final PhraseHelper phraseHelper = components.getPhraseHelper(); + final BytesRef[] terms = components.getTerms(); if (phraseHelper.hasPositionSensitivity()) { insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms(); assert insensitiveTerms.length <= terms.length : "insensitive terms should be smaller set of all terms"; @@ -102,7 +95,7 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc } // Handle automata - if (automata.length > 0) { + if (components.getAutomata().length > 0) { createOffsetsEnumsForAutomata(termsIndex, doc, offsetsEnums); } } @@ -163,7 +156,7 @@ protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsInd PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); if (postingsEnum == null) { // no offsets or positions available - throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); + throw new IllegalArgumentException("field '" + getField() + "' was indexed without offsets, cannot highlight"); } if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted results.add(new OffsetsEnum.OfPostings(term, postingsEnum)); @@ -173,6 +166,7 @@ protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsInd } protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List results) throws IOException { + final CharacterRunAutomaton[] automata = components.getAutomata(); List> automataPostings = new ArrayList<>(automata.length); for (int i = 0; i < automata.length; i++) { automataPostings.add(new ArrayList<>()); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index a35591b71ae4..30dbdd4da4b5 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -23,7 +23,6 @@ import java.util.Collections; import java.util.List; import java.util.function.Function; -import java.util.function.Predicate; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.FilteringTokenFilter; @@ -33,7 +32,6 @@ import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.CharacterRunAutomaton; @@ -56,25 +54,22 @@ public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer, memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets memIndexLeafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable // preFilter for MemoryIndex - preMemIndexFilterAutomaton = buildCombinedAutomaton(components.getFieldMatcher(), terms, this.automata, components.getPhraseHelper(), multiTermQueryRewrite); + preMemIndexFilterAutomaton = buildCombinedAutomaton(components, multiTermQueryRewrite); } /** * Build one {@link CharacterRunAutomaton} matching any term the query might match. */ - private static CharacterRunAutomaton buildCombinedAutomaton(Predicate fieldMatcher, - BytesRef[] terms, - CharacterRunAutomaton[] automata, - PhraseHelper strictPhrases, + private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components, Function> multiTermQueryRewrite) { List allAutomata = new ArrayList<>(); - if (terms.length > 0) { - allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms)))); + if (components.getTerms().length > 0) { + allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms())))); } - Collections.addAll(allAutomata, automata); - for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) { + Collections.addAll(allAutomata, components.getAutomata()); + for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) { Collections.addAll(allAutomata, - MultiTermHighlighting.extractAutomata(spanQuery, fieldMatcher, true, multiTermQueryRewrite));//true==lookInSpan + MultiTermHighlighting.extractAutomata(spanQuery, components.getFieldMatcher(), true, multiTermQueryRewrite));//true==lookInSpan } if (allAutomata.size() == 1) { @@ -105,7 +100,7 @@ public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) // Filter the tokenStream to applicable terms tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton); memoryIndex.reset(); - memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close() + memoryIndex.addField(getField(), tokenStream);//note: calls tokenStream.reset() & close() if (reader == null) { return createOffsetsEnumFromReader(memIndexLeafReader, 0); @@ -114,7 +109,7 @@ public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) new OverlaySingleDocTermsLeafReader( reader, memIndexLeafReader, - field, + getField(), docId), docId); } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java index b73087e47a73..0e6a2216751e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java @@ -88,7 +88,7 @@ public int compareTo(OffsetsEnum other) { * The term at this position. * This BytesRef is safe to continue to refer to, even after we move to the next position. * - * @see {@link Passage#getMatchTerms()}. + * @see Passage#getMatchTerms() */ public abstract BytesRef getTerm() throws IOException; diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java index ab391213aba8..b0249fa56a3f 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java @@ -34,11 +34,11 @@ public PostingsWithTermVectorsOffsetStrategy(UHComponents components) { @Override public OffsetsEnum getOffsetsEnum(LeafReader leafReader, int docId, String content) throws IOException { - Terms docTerms = leafReader.getTermVector(docId, field); + Terms docTerms = leafReader.getTermVector(docId, getField()); if (docTerms == null) { return OffsetsEnum.EMPTY; } - leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms, field); + leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms, getField()); return createOffsetsEnumFromReader(leafReader, docId); } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java index 895e95e50919..94528e2c29b4 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java @@ -40,17 +40,17 @@ public UnifiedHighlighter.OffsetSource getOffsetSource() { @Override public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException { - Terms tvTerms = reader.getTermVector(docId, field); + Terms tvTerms = reader.getTermVector(docId, getField()); if (tvTerms == null) { return OffsetsEnum.EMPTY; } - LeafReader singleDocReader = new TermVectorLeafReader(field, tvTerms); + LeafReader singleDocReader = new TermVectorLeafReader(getField(), tvTerms); return createOffsetsEnumFromReader( new OverlaySingleDocTermsLeafReader( reader, singleDocReader, - field, + getField(), docId), docId); } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java index e829769909ef..677ee4b7b021 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java @@ -46,7 +46,7 @@ public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer convertTermsToAutomata(components.getTerms(), components.getAutomata()), components.getHighlightFlags()), indexAnalyzer); - assert phraseHelper.hasPositionSensitivity() == false; + assert components.getPhraseHelper().hasPositionSensitivity() == false; } private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) { @@ -67,7 +67,7 @@ public String toString() { @Override public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException { - return new TokenStreamOffsetsEnum(tokenStream(content), automata); + return new TokenStreamOffsetsEnum(tokenStream(content), components.getAutomata()); } private static class TokenStreamOffsetsEnum extends OffsetsEnum { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java index 9719fab27485..eed1e9c7de19 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UHComponents.java @@ -33,9 +33,9 @@ public class UHComponents { private final String field; private final Predicate fieldMatcher; private final Query query; - private final BytesRef[] terms; - private final PhraseHelper phraseHelper; - private final CharacterRunAutomaton[] automata; + private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive) + private final PhraseHelper phraseHelper; // Query: position-sensitive information + private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive private final Set highlightFlags; public UHComponents(String field, Predicate fieldMatcher, Query query, diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java index 301788b7700f..3e2cc2e5f9a4 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java @@ -1283,7 +1283,8 @@ protected Predicate getFieldMatcher(String field) { if (highlighterFieldMatch.getFlags("text").contains(HighlightFlag.WEIGHT_MATCHES)) { assertEquals("This is the text field. You can put some text if you want.", snippets[0]); } else { - //nocommit why does this highlight the first "text"? + // note: odd that the first "text" is highlighted. Apparently WSTE converts PhraseQuery to a SpanNearQuery with + // with inorder=false when non-0 slop. Probably a bug. assertEquals("This is the text field. You can put some text if you want.", snippets[0]); } From a3d87e2297f53afba3ed0b41868b063713103080 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Mon, 27 Aug 2018 01:02:40 -0400 Subject: [PATCH 13/14] LUCENE-8286: UH don't use MultiOffsetsEnum for 0 or 1 OEs. --- .../lucene/search/uhighlight/FieldOffsetStrategy.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java index 27d9318bae36..7ba80d08afe4 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java @@ -100,8 +100,11 @@ protected OffsetsEnum createOffsetsEnumFromReader(LeafReader leafReader, int doc } } - //TODO if only one OE then return it; don't wrap in Multi. If none, return NONE - return new OffsetsEnum.MultiOffsetsEnum(offsetsEnums); + switch (offsetsEnums.size()) { + case 0: return OffsetsEnum.EMPTY; + case 1: return offsetsEnums.get(0); + default: return new OffsetsEnum.MultiOffsetsEnum(offsetsEnums); + } } protected void createOffsetsEnumsWeightMatcher(LeafReader _leafReader, int docId, List results) throws IOException { From f054ffcf580a499a0f180f2b6d2c7cd4839617d0 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Mon, 27 Aug 2018 09:31:50 -0400 Subject: [PATCH 14/14] LUCENE-8286: UH document HighlightFlags --- .../search/uhighlight/UnifiedHighlighter.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index 4e4d81528733..e1ece502637f 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -1112,11 +1112,23 @@ public CacheHelper getReaderCacheHelper() { * Flags for controlling highlighting behavior. */ public enum HighlightFlag { + /** @see UnifiedHighlighter#setHighlightPhrasesStrictly(boolean) */ PHRASES, + + /** @see UnifiedHighlighter#setHandleMultiTermQuery(boolean) */ MULTI_TERM_QUERY, + + /** Passage relevancy is more important than speed. True by default. */ PASSAGE_RELEVANCY_OVER_SPEED, + + /** + * Internally use the {@link Weight#matches(LeafReaderContext, int)} API for highlighting. + * It's more accurate to the query, though might not calculate passage relevancy as well. + * Use of this flag requires {@link #MULTI_TERM_QUERY} and {@link #PHRASES}. + * {@link #PASSAGE_RELEVANCY_OVER_SPEED} will be ignored. False by default. + */ WEIGHT_MATCHES - // TODO: ignoreQueryFields + // TODO: useQueryBoosts } }