From d60ba6a1687566efdcee1f29f1666f4ec9d4fba5 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 25 May 2018 17:53:02 -0400 Subject: [PATCH 1/3] initial move of CompletionTokenStream to ConcatenateGraphFilter --- .../ConcatenateGraphFilter.java} | 130 ++++++++++-------- .../ConcatenateGraphFilterFactory.java | 61 ++++++++ .../miscellaneous/FingerprintFilter.java | 4 +- ...he.lucene.analysis.util.TokenFilterFactory | 1 + .../TestConcatenateGraphFilter.java} | 95 ++++++++----- .../TestConcatenateGraphFilterFactory.java | 55 ++++++++ .../miscellaneous/TestFingerprintFilter.java | 9 ++ .../analysis/TokenStreamToAutomaton.java | 2 + .../suggest/document/CompletionAnalyzer.java | 23 +--- .../suggest/document/CompletionQuery.java | 4 +- .../search/suggest/document/ContextQuery.java | 11 +- .../suggest/document/ContextSuggestField.java | 24 ++-- .../document/FuzzyCompletionQuery.java | 3 +- .../suggest/document/NRTSuggesterBuilder.java | 4 + .../document/PrefixCompletionQuery.java | 3 +- .../search/suggest/document/SuggestField.java | 15 +- .../document/TestContextSuggestField.java | 13 +- .../suggest/document/TestSuggestField.java | 29 +++- 18 files changed, 342 insertions(+), 144 deletions(-) rename lucene/{suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java => analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java} (67%) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java rename lucene/{suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java => analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java} (69%) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java similarity index 67% rename from lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java index 7308e65acc9c..e222d6b680d1 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java @@ -14,14 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.search.suggest.document; +package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeReflector; @@ -31,55 +33,80 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.FiniteStringsIterator; import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.Util; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL; - /** - * Token stream which converts a provided token stream to an automaton. - * The accepted strings enumeration from the automaton are available through the - * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute - * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store - * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}) + * Concatenates/Joins every incoming token with a separator into one output token for every path through the + * token stream (which is a graph). In simple cases this yields one token, but in the presence of any tokens with + * a zero positionIncrmeent (e.g. synonyms) it will be more. This filter uses the token bytes, position increment, + * and position length of the incoming stream. Other attributes are not used or manipulated. * * @lucene.experimental */ -public final class CompletionTokenStream extends TokenStream { +public final class ConcatenateGraphFilter extends TokenFilter { + + /* + * Token stream which converts a provided token stream to an automaton. + * The accepted strings enumeration from the automaton are available through the + * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute + * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store + * a completion's payload (see {@link ConcatenateGraphFilter#setPayload(org.apache.lucene.util.BytesRef)}) + */ + + /** + * Represents the separation between tokens, if + * preserveSep is true. + */ + public final static char SEP_CHAR = '\u001F'; + public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES; + public final static boolean DEFAULT_PRESERVE_SEP = true; + public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true; private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class); private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - final TokenStream inputTokenStream; - final boolean preserveSep; - final boolean preservePositionIncrements; - final int maxGraphExpansions; + //nocommit add getters for these instead? Accessor: org.apache.lucene.search.suggest.document.ContextSuggestField.wrapTokenStream + public final boolean preserveSep; + public final boolean preservePositionIncrements; + public final int maxGraphExpansions; + public TokenStream getInput() { return super.input; } // nocommit remove? see ContextSuggestField.wrapTokenStream - private FiniteStringsIterator finiteStrings; + private LimitedFiniteStringsIterator finiteStrings; private BytesRef payload; private CharTermAttribute charTermAttribute; + private State endState; /** * Creates a token stream to convert input to a token stream - * of accepted strings by its automaton. + * of accepted strings by its token stream graph. *

- * The token stream input is converted to an automaton - * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer} + * This constructor uses the default settings of the constants in this class. */ - CompletionTokenStream(TokenStream inputTokenStream) { + public ConcatenateGraphFilter(TokenStream inputTokenStream) { this(inputTokenStream, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); } - CompletionTokenStream(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { - // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume - // the input stream entirely in the first call to incrementToken - this.inputTokenStream = inputTokenStream; + /** + * Creates a token stream to convert input to a token stream + * of accepted strings by its token stream graph. + * + * @param inputTokenStream The input/incoming TokenStream + * @param preserveSep + * @param preservePositionIncrements Whether to generate holes in the automaton for missing positions + * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths through, then we'll throw + * {@link TooComplexToDeterminizeException} to preserve the stability and memory of the + * machine. + * @throws TooComplexToDeterminizeException if the tokenStream graph has more than {@code maxGraphExpansions} + * expansions + * + */ + public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { + super(inputTokenStream); this.preserveSep = preserveSep; this.preservePositionIncrements = preservePositionIncrements; this.maxGraphExpansions = maxGraphExpansions; @@ -87,6 +114,7 @@ public final class CompletionTokenStream extends TokenStream { /** * Sets a payload available throughout successive token stream enumeration + * @lucene.internal */ public void setPayload(BytesRef payload) { this.payload = payload; @@ -94,17 +122,17 @@ public void setPayload(BytesRef payload) { @Override public boolean incrementToken() throws IOException { - clearAttributes(); - if (finiteStrings == null) { - Automaton automaton = toAutomaton(); - finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); - } - IntsRef string = finiteStrings.next(); if (string == null) { return false; } + clearAttributes(); + + if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one... + posIncrAtt.setPositionIncrement(0); // stacked + } + Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8 if (charTermAttribute != null) { charTermAttribute.setLength(0); @@ -119,27 +147,19 @@ public boolean incrementToken() throws IOException { @Override public void end() throws IOException { - super.end(); - if (finiteStrings == null) { - inputTokenStream.end(); - } - } - - @Override - public void close() throws IOException { - if (finiteStrings == null) { - inputTokenStream.close(); - } + restoreState(endState); } + //nocommit move method to before incrementToken @Override public void reset() throws IOException { - super.reset(); - if (hasAttribute(CharTermAttribute.class)) { - // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion - charTermAttribute = getAttribute(CharTermAttribute.class); + Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, then end() + endState = captureState(); + finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); + if (charTermAttribute == null) { + // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion + charTermAttribute = getAttribute(CharTermAttribute.class); // null if none; it's okay } - finiteStrings = null; } /** @@ -163,7 +183,7 @@ public Automaton toAutomaton(boolean unicodeAware) throws IOException { // separator between tokens: final TokenStreamToAutomaton tsta; if (preserveSep) { - tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL); + tsta = new EscapingTokenStreamToAutomaton(SEP_CHAR); } else { // When we're not preserving sep, we don't steal 0xff // byte, so we don't need to do any escaping: @@ -172,14 +192,14 @@ public Automaton toAutomaton(boolean unicodeAware) throws IOException { tsta.setPreservePositionIncrements(preservePositionIncrements); tsta.setUnicodeArcs(unicodeAware); - automaton = tsta.toAutomaton(inputTokenStream); + automaton = tsta.toAutomaton(input); } finally { - IOUtils.closeWhileHandlingException(inputTokenStream); + IOUtils.closeWhileHandlingException(input); } // TODO: we can optimize this somewhat by determinizing // while we convert - automaton = replaceSep(automaton, preserveSep, SEP_LABEL); + automaton = replaceSep(automaton, preserveSep, SEP_CHAR); // This automaton should not blow up during determinize: return Operations.determinize(automaton, maxGraphExpansions); } @@ -240,7 +260,7 @@ private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLab if (t.min == TokenStreamToAutomaton.POS_SEP) { assert t.max == TokenStreamToAutomaton.POS_SEP; if (preserveSep) { - // Remap to SEP_LABEL: + // Remap to SEP_CHAR: result.addTransition(state, t.dest, sepLabel); } else { result.addEpsilon(state, t.dest); @@ -269,6 +289,7 @@ private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLab /** * Attribute providing access to the term builder and UTF-16 conversion + * @lucene.internal */ public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute { /** @@ -283,7 +304,8 @@ public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute { } /** - * Custom attribute implementation for completion token stream + * Implementation of {@link BytesRefBuilderTermAttribute} + * @lucene.internal */ public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute { private final BytesRefBuilder bytes = new BytesRefBuilder(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java new file mode 100644 index 000000000000..d8d103944010 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java @@ -0,0 +1,61 @@ +/* + This software was produced for the U. S. Government + under Contract No. W15P7T-11-C-F600, and is + subject to the Rights in Noncommercial Computer Software + and Noncommercial Computer Software Documentation + Clause 252.227-7014 (JUN 1995) + + Copyright 2013 The MITRE Corporation. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link ConcatenateGraphFilter}. + * + *

+ * nocommit update
+ * 
+ * @see ConcatenateGraphFilter + * @since 7.4.0 + */ +public class ConcatenateGraphFilterFactory extends TokenFilterFactory { + + private boolean preserveSep; + private boolean preservePositionIncrements; + private int maxGraphExpansions; + + public ConcatenateGraphFilterFactory(Map args) { + super(args); + + preserveSep = getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP); + preservePositionIncrements = getBoolean(args, "preservePositionIncrements", ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS); + maxGraphExpansions = getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); + + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new ConcatenateGraphFilter(input, preserveSep, preservePositionIncrements, maxGraphExpansions); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java index dfe06c88fbf2..71dab429191b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java @@ -81,8 +81,7 @@ public FingerprintFilter(TokenStream input, int maxOutputTokenSize, @Override public final boolean incrementToken() throws IOException { - if (uniqueTerms != null) { - // We have already built the single output token - there's no more + if (inputEnded) { return false; } boolean result = buildSingleOutputToken(); @@ -177,6 +176,7 @@ public int compare(Object o1, Object o2) { } }); + //TODO lets append directly to termAttribute? StringBuilder sb = new StringBuilder(); for (Object item : items) { if (sb.length() >= 1) { diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 181192021797..801ac71c0cae 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -63,6 +63,7 @@ org.apache.lucene.analysis.lv.LatvianStemFilterFactory org.apache.lucene.analysis.minhash.MinHashFilterFactory org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory +org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java similarity index 69% rename from lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java index 6f558d1985d9..c4cb0d5aa259 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/CompletionTokenStreamTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java @@ -14,27 +14,27 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.search.suggest.document; +package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; import org.junit.Test; -public class CompletionTokenStreamTest extends BaseTokenStreamTestCase { +public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase { @Test public void testBasic() throws Exception { @@ -42,9 +42,9 @@ public void testBasic() throws Exception { String input = "mykeyword"; BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenStream); + concatStream.setPayload(payload); + PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); assertTokenStreamContents(stream, new String[] {input}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); } @@ -54,9 +54,9 @@ public void testWithNoPreserveSep() throws Exception { String input = "mykeyword another keyword"; BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream, false, false, 100); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenStream, false, false, 100); + concatStream.setPayload(payload); + PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); } @@ -66,14 +66,14 @@ public void testWithMultipleTokens() throws Exception { String input = "mykeyword another keyword"; tokenStream.setReader(new StringReader(input)); BytesRef payload = new BytesRef("payload"); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(tokenStream); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenStream); + concatStream.setPayload(payload); + PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); CharsRefBuilder builder = new CharsRefBuilder(); builder.append("mykeyword"); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append((ConcatenateGraphFilter.SEP_CHAR)); builder.append("another"); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append((ConcatenateGraphFilter.SEP_CHAR)); builder.append("keyword"); assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); } @@ -85,11 +85,11 @@ public void testWithSynonym() throws Exception { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader("mykeyword")); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(filter); BytesRef payload = new BytesRef("payload"); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); - assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new String[] {payload.utf8ToString(), payload.utf8ToString()}, new int[] { 1, 1 }, null, null); + concatStream.setPayload(payload); + PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); + assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new String[] {payload.utf8ToString(), payload.utf8ToString()}, new int[] { 1, 0 }, null, null); } @Test @@ -101,25 +101,50 @@ public void testWithSynonyms() throws Exception { tokenStream.setReader(new StringReader(input)); SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true); BytesRef payload = new BytesRef("payload"); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter, true, false, 100); - completionTokenStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(filter, true, false, 100); + concatStream.setPayload(payload); + PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); String[] expectedOutputs = new String[2]; CharsRefBuilder expectedOutput = new CharsRefBuilder(); expectedOutput.append("mykeyword"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); expectedOutput.append("another"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); expectedOutput.append("keyword"); expectedOutputs[0] = expectedOutput.toCharsRef().toString(); expectedOutput.clear(); expectedOutput.append("mysynonym"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); expectedOutput.append("another"); - expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL)); + expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); expectedOutput.append("keyword"); expectedOutputs[1] = expectedOutput.toCharsRef().toString(); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); + } + + @Test + public void testWithStopword() throws Exception { + for (boolean preservePosInc : new boolean[]{true, false}) { + Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); + String input = "a mykeyword a keyword a"; + tokenStream.setReader(new StringReader(input)); + TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a")); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10); + CharsRefBuilder builder = new CharsRefBuilder(); + if (preservePosInc) { + builder.append((ConcatenateGraphFilter.SEP_CHAR)); + } + builder.append("mykeyword"); + builder.append((ConcatenateGraphFilter.SEP_CHAR)); + if (preservePosInc) { + builder.append((ConcatenateGraphFilter.SEP_CHAR)); + } + builder.append("keyword"); + if (preservePosInc) { + builder.append((ConcatenateGraphFilter.SEP_CHAR)); + } + assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()}); + } } @Test @@ -137,23 +162,25 @@ public void testValidNumberOfExpansions() throws IOException { tokenizer.setReader(new StringReader(valueBuilder.toString())); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter); - completionTokenStream.setPayload(new BytesRef()); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream); + ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(filter); + concatStream.setPayload(new BytesRef()); + PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); stream.reset(); - CompletionTokenStream.BytesRefBuilderTermAttribute attr = stream.addAttribute(CompletionTokenStream.BytesRefBuilderTermAttribute.class); - PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class); - int maxPos = 0; + ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class); int count = 0; while(stream.incrementToken()) { count++; assertNotNull(attr.getBytesRef()); assertTrue(attr.getBytesRef().length > 0); - maxPos += posAttr.getPositionIncrement(); } stream.close(); assertEquals(count, 256); - assertEquals(count, maxPos); + } + + public void testEmpty() throws IOException { + Tokenizer tokenizer = whitespaceMockTokenizer(""); + ConcatenateGraphFilter filter = new ConcatenateGraphFilter(tokenizer); + assertTokenStreamContents(filter, new String[0]); } public final static class PayloadAttrToTypeAttrFilter extends TokenFilter { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java new file mode 100644 index 000000000000..5b3ebacf5495 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; + +public class TestConcatenateGraphFilterFactory extends BaseTokenStreamFactoryTestCase { + public void test() throws Exception { + for (final boolean consumeAll : new boolean[]{true, false}) { + final String input = "A1 B2 A1 D4 C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + tokenizer.setEnableChecks(consumeAll); + TokenStream stream = tokenizer; + stream = tokenFilterFactory("Concatenate"//nocommit test an arg + ).create(stream); + assertTokenStreamContents(stream, new String[]{input.replace(' ', ConcatenateGraphFilter.SEP_CHAR)}); + } + } + + public void testRequired() throws Exception { + // no params are required + tokenFilterFactory("Concatenate"); + } + + /** + * Test that bogus arguments result in exception + */ + public void testBogusArguments() throws Exception { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> + tokenFilterFactory("Concatenate", "bogusArg", "bogusValue")); + assertTrue(expected.getMessage().contains("Unknown parameters")); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java index 450447ac9bab..76bd617f4088 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java @@ -69,4 +69,13 @@ public void testSingleToken() throws Exception { } } + public void testEmpty() throws Exception { + for (final boolean consumeAll : new boolean[] { true, false }) { + MockTokenizer tokenizer = whitespaceMockTokenizer(""); + tokenizer.setEnableChecks(consumeAll); + TokenStream stream = new FingerprintFilter(tokenizer); + assertTokenStreamContents(stream, new String[0]); + } + } + } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java index 0675abe94a36..881366a93dc5 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java @@ -214,6 +214,8 @@ public Automaton toAutomaton(TokenStream in) throws IOException { if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) { endPosInc = 1; + } else if (endPosInc > 0 && preservePositionIncrements==false) { + endPosInc = 0; } if (endPosInc > 0) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java index 13bd392aa9df..cdb49de2062f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java @@ -19,7 +19,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; /** * Wraps an {@link org.apache.lucene.analysis.Analyzer} @@ -37,24 +37,11 @@ */ public final class CompletionAnalyzer extends AnalyzerWrapper { - /** - * Represents the separation between tokens, if - * preserveSep is true - *

- * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream} - * payload - */ - final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP; - /** * Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton} */ final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE; - final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES; - final static boolean DEFAULT_PRESERVE_SEP = true; - final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true; - private final Analyzer analyzer; /** @@ -101,7 +88,7 @@ public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preser * preserving token separation, position increments and no limit on graph expansions */ public CompletionAnalyzer(Analyzer analyzer) { - this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } /** @@ -109,7 +96,7 @@ public CompletionAnalyzer(Analyzer analyzer) { * with no limit on graph expansions */ public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) { - this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS); + this(analyzer, preserveSep, preservePositionIncrements, ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); } /** @@ -117,7 +104,7 @@ public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preser * preserving token separation and position increments */ public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) { - this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions); + this(analyzer, ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions); } /** @@ -143,7 +130,7 @@ protected Analyzer getWrappedAnalyzer(String fieldName) { @Override protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { - CompletionTokenStream tokenStream = new CompletionTokenStream(components.getTokenStream(), + ConcatenateGraphFilter tokenStream = new ConcatenateGraphFilter(components.getTokenStream(), preserveSep, preservePositionIncrements, maxGraphExpansions); return new TokenStreamComponents(components.getTokenizer(), tokenStream); } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java index 49fe7d08dff1..1ed13734d88a 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java @@ -27,7 +27,7 @@ import org.apache.lucene.search.suggest.BitsProducer; import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.HOLE_CHARACTER; -import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL; +import static org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter.SEP_CHAR; /** * Abstract {@link Query} that match documents containing terms with a specified prefix @@ -158,7 +158,7 @@ private void validate(String termText) { case HOLE_CHARACTER: throw new IllegalArgumentException( "Term text cannot contain HOLE character U+001E; this character is reserved"); - case SEP_LABEL: + case SEP_CHAR: throw new IllegalArgumentException( "Term text cannot contain unit separator character U+001F; this character is reserved"); default: diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java index 6217ca38f85f..5d56795c4b84 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java @@ -22,6 +22,7 @@ import java.util.Map; import java.util.TreeSet; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Weight; @@ -175,10 +176,10 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo return new CompletionWeight(this, innerAutomaton); } - // if separators are preserved the fst contains a SEP_LABEL + // if separators are preserved the fst contains a SEP_CHAR // behind each gap. To have a matching automaton, we need to - // include the SEP_LABEL in the query as well - Automaton optionalSepLabel = Operations.optional(Automata.makeChar(CompletionAnalyzer.SEP_LABEL)); + // include the SEP_CHAR in the query as well + Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_CHAR)); Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton); Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton); contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES); @@ -302,9 +303,9 @@ private void setInnerWeight(IntsRef ref, int offset) { } ref.offset = ++i; assert ref.offset < ref.length : "input should not end with the context separator"; - if (ref.ints[i] == CompletionAnalyzer.SEP_LABEL) { + if (ref.ints[i] == ConcatenateGraphFilter.SEP_CHAR) { ref.offset++; - assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_LABEL"; + assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_CHAR"; } ref.length = ref.length - ref.offset; refBuilder.copyInts(ref.ints, ref.offset, ref.length); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java index 4cb91b8053c5..84a99c07c658 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -83,23 +84,24 @@ protected Iterable contexts() { } @Override - protected CompletionTokenStream wrapTokenStream(TokenStream stream) { + protected ConcatenateGraphFilter wrapTokenStream(TokenStream stream) { final Iterable contexts = contexts(); for (CharSequence context : contexts) { validate(context); } - CompletionTokenStream completionTokenStream; - if (stream instanceof CompletionTokenStream) { - completionTokenStream = (CompletionTokenStream) stream; - PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(completionTokenStream.inputTokenStream, (char) CONTEXT_SEPARATOR, contexts); - completionTokenStream = new CompletionTokenStream(prefixTokenFilter, - completionTokenStream.preserveSep, - completionTokenStream.preservePositionIncrements, - completionTokenStream.maxGraphExpansions); + ConcatenateGraphFilter concatStream; + if (stream instanceof ConcatenateGraphFilter) { + //nocommit this is awkward; is there a better way avoiding re-creating the chain? + concatStream = (ConcatenateGraphFilter) stream; + PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(concatStream.getInput(), (char) CONTEXT_SEPARATOR, contexts); + concatStream = new ConcatenateGraphFilter(prefixTokenFilter, + concatStream.preserveSep, + concatStream.preservePositionIncrements, + concatStream.maxGraphExpansions); } else { - completionTokenStream = new CompletionTokenStream(new PrefixTokenFilter(stream, (char) CONTEXT_SEPARATOR, contexts)); + concatStream = new ConcatenateGraphFilter(new PrefixTokenFilter(stream, (char) CONTEXT_SEPARATOR, contexts)); } - return completionTokenStream; + return concatStream; } @Override diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java index b243f4ede83d..b5754e8d765c 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java @@ -23,6 +23,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; @@ -144,7 +145,7 @@ public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter, i @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); + ConcatenateGraphFilter stream = (ConcatenateGraphFilter) analyzer.tokenStream(getField(), getTerm().text()); Set refs = new HashSet<>(); Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs); if (unicodeAware) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java index 270463175d79..1dd2b1f7ae49 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.PriorityQueue; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -43,6 +44,9 @@ final class NRTSuggesterBuilder { * in the output */ public static final int PAYLOAD_SEP = '\u001F'; + static { + assert PAYLOAD_SEP == ConcatenateGraphFilter.SEP_CHAR; + } /** * Marks end of the analyzed input and start of dedup diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java index 7bb75e9261ca..2cffc761f69f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java @@ -19,6 +19,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; @@ -68,7 +69,7 @@ public PrefixCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); + ConcatenateGraphFilter stream = (ConcatenateGraphFilter) analyzer.tokenStream(getField(), getTerm().text()); return new CompletionWeight(this, stream.toAutomaton()); } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java index 7f06328ee1b0..0ee338f4e8c2 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexOptions; @@ -100,21 +101,21 @@ public SuggestField(String name, String value, int weight) { @Override public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { - CompletionTokenStream completionStream = wrapTokenStream(super.tokenStream(analyzer, reuse)); + ConcatenateGraphFilter completionStream = wrapTokenStream(super.tokenStream(analyzer, reuse)); completionStream.setPayload(buildSuggestPayload()); return completionStream; } /** - * Wraps a stream with a CompletionTokenStream. + * Wraps a stream with a ConcatenateGraphFilter. * * Subclasses can override this method to change the indexing pipeline. */ - protected CompletionTokenStream wrapTokenStream(TokenStream stream) { - if (stream instanceof CompletionTokenStream) { - return (CompletionTokenStream) stream; + protected ConcatenateGraphFilter wrapTokenStream(TokenStream stream) { + if (stream instanceof ConcatenateGraphFilter) { + return (ConcatenateGraphFilter) stream; } else { - return new CompletionTokenStream(stream); + return new ConcatenateGraphFilter(stream); } } @@ -140,7 +141,7 @@ private BytesRef buildSuggestPayload() { private boolean isReserved(char c) { switch (c) { - case CompletionAnalyzer.SEP_LABEL: + case ConcatenateGraphFilter.SEP_CHAR: case CompletionAnalyzer.HOLE_CHARACTER: case NRTSuggesterBuilder.END_BYTE: return true; diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java index 0c3b254c132e..23b1267a19a1 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; @@ -109,21 +110,21 @@ public void testTokenStream() throws Exception { CharsRefBuilder builder = new CharsRefBuilder(); builder.append("context1"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append(ConcatenateGraphFilter.SEP_CHAR); builder.append("input"); expectedOutputs[0] = builder.toCharsRef().toString(); builder.clear(); builder.append("context2"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(((char) CompletionAnalyzer.SEP_LABEL)); + builder.append(ConcatenateGraphFilter.SEP_CHAR); builder.append("input"); expectedOutputs[1] = builder.toCharsRef().toString(); - TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null)); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null)); + assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer); - stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null)); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 1}, null, null); + stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null)); + assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); } @Test diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index a6659e082d5d..a083535917ac 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -32,7 +32,11 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene70.Lucene70Codec; @@ -99,7 +103,7 @@ public void testNegativeWeight() throws Exception { public void testReservedChars() throws Exception { CharsRefBuilder charsRefBuilder = new CharsRefBuilder(); charsRefBuilder.append("sugg"); - charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.SEP_LABEL); + charsRefBuilder.setCharAt(2, ConcatenateGraphFilter.SEP_CHAR); IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { new SuggestField("name", charsRefBuilder.toString(), 1); }); @@ -144,11 +148,11 @@ public void testTokenStream() throws Exception { output.writeByte(SuggestField.TYPE); } BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray()); - TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null)); + TokenStream stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null)); assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer); - stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null)); + stream = new PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null)); assertTokenStreamContents(stream, new String[] {"input"}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); } @@ -894,4 +898,23 @@ public PostingsFormat getPostingsFormatForField(String field) { iwc.setCodec(filterCodec); return iwc; } + + public final static class PayloadAttrToTypeAttrFilter extends TokenFilter { + private PayloadAttribute payload = addAttribute(PayloadAttribute.class); + private TypeAttribute type = addAttribute(TypeAttribute.class); + + protected PayloadAttrToTypeAttrFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + // we move them over so we can assert them more easily in the tests + type.setType(payload.getPayload().utf8ToString()); + return true; + } + return false; + } + } } From d813eca1353d32c4187970141beefe7e784bd2a3 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Thu, 31 May 2018 16:39:09 -0400 Subject: [PATCH 2/3] * ConcatenateGraphFilter now extends TokenStream and calls toAutomaton in incrementToken() * ConcatenateGraphFilter SEP_CHAR put back to SEP_LABEL (int) * ConcatenateGraphFilter end() like was before; no captureState * Added a CompletionTokenStream wrapper to ConcatenateGraphFilter to ** provide the NRT suggester access to state it wants (now is private in CGF) ** setPayload (now removed from CGF). * Fixed CGF Factory license header to be ASF --- .../miscellaneous/ConcatenateGraphFilter.java | 108 ++++++++++-------- .../ConcatenateGraphFilterFactory.java | 55 +++++---- .../analysis/core/TestRandomChains.java | 7 +- .../TestConcatenateGraphFilter.java | 98 ++++++---------- .../TestConcatenateGraphFilterFactory.java | 38 +++++- .../suggest/document/CompletionAnalyzer.java | 2 +- .../suggest/document/CompletionQuery.java | 4 +- .../document/CompletionTokenStream.java | 82 +++++++++++++ .../search/suggest/document/ContextQuery.java | 10 +- .../suggest/document/ContextSuggestField.java | 17 ++- .../document/FuzzyCompletionQuery.java | 3 +- .../suggest/document/NRTSuggesterBuilder.java | 5 +- .../document/PrefixCompletionQuery.java | 3 +- .../search/suggest/document/SuggestField.java | 12 +- .../document/TestContextSuggestField.java | 4 +- .../suggest/document/TestSuggestField.java | 2 +- 16 files changed, 271 insertions(+), 179 deletions(-) create mode 100644 lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java index e222d6b680d1..b90ed45260a0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java @@ -18,11 +18,9 @@ import java.io.IOException; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.util.AttributeImpl; @@ -47,7 +45,7 @@ * * @lucene.experimental */ -public final class ConcatenateGraphFilter extends TokenFilter { +public final class ConcatenateGraphFilter extends TokenStream { /* * Token stream which converts a provided token stream to an automaton. @@ -61,25 +59,22 @@ public final class ConcatenateGraphFilter extends TokenFilter { * Represents the separation between tokens, if * preserveSep is true. */ - public final static char SEP_CHAR = '\u001F'; + public final static int SEP_LABEL = TokenStreamToAutomaton.POS_SEP; public final static int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES; public final static boolean DEFAULT_PRESERVE_SEP = true; public final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true; - private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class); private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - //nocommit add getters for these instead? Accessor: org.apache.lucene.search.suggest.document.ContextSuggestField.wrapTokenStream - public final boolean preserveSep; - public final boolean preservePositionIncrements; - public final int maxGraphExpansions; - public TokenStream getInput() { return super.input; } // nocommit remove? see ContextSuggestField.wrapTokenStream + private final TokenStream inputTokenStream; + private final boolean preserveSep; + private final boolean preservePositionIncrements; + private final int maxGraphExpansions; private LimitedFiniteStringsIterator finiteStrings; - private BytesRef payload; private CharTermAttribute charTermAttribute; - private State endState; + private boolean wasReset = false; /** * Creates a token stream to convert input to a token stream @@ -96,8 +91,11 @@ public ConcatenateGraphFilter(TokenStream inputTokenStream) { * of accepted strings by its token stream graph. * * @param inputTokenStream The input/incoming TokenStream - * @param preserveSep - * @param preservePositionIncrements Whether to generate holes in the automaton for missing positions + * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the concatenated token + * @param preservePositionIncrements Whether to add an empty token for missing positions. + * The effect is a consecutive {@link #SEP_LABEL}. + * When false, it's as if there were no missing positions + * (we pretend the surrounding tokens were adjacent). * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths through, then we'll throw * {@link TooComplexToDeterminizeException} to preserve the stability and memory of the * machine. @@ -106,22 +104,34 @@ public ConcatenateGraphFilter(TokenStream inputTokenStream) { * */ public ConcatenateGraphFilter(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { - super(inputTokenStream); + // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume + // the input stream entirely in the first call to incrementToken + this.inputTokenStream = inputTokenStream; this.preserveSep = preserveSep; this.preservePositionIncrements = preservePositionIncrements; this.maxGraphExpansions = maxGraphExpansions; } - /** - * Sets a payload available throughout successive token stream enumeration - * @lucene.internal - */ - public void setPayload(BytesRef payload) { - this.payload = payload; + @Override + public void reset() throws IOException { + super.reset(); + // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion + charTermAttribute = getAttribute(CharTermAttribute.class); // may return null + finiteStrings = null; + wasReset = true; } @Override public boolean incrementToken() throws IOException { + if (finiteStrings == null) { + if (wasReset == false) { + throw new IllegalStateException("reset() missing before incrementToken"); + } + // lazy init/consume + Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, end(), and close() on inputTokenStream + finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); + } + IntsRef string = finiteStrings.next(); if (string == null) { return false; @@ -138,52 +148,53 @@ public boolean incrementToken() throws IOException { charTermAttribute.setLength(0); charTermAttribute.append(bytesAtt.toUTF16()); } - if (payload != null) { - payloadAttr.setPayload(this.payload); - } return true; } @Override public void end() throws IOException { - restoreState(endState); + super.end(); + //nocommit I think we don't need to delegate this under any condition + if (finiteStrings == null) { + //toAutomaton not called yet, so delegate lifecycle + inputTokenStream.end(); + } } - //nocommit move method to before incrementToken @Override - public void reset() throws IOException { - Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, then end() - endState = captureState(); - finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); - if (charTermAttribute == null) { - // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion - charTermAttribute = getAttribute(CharTermAttribute.class); // null if none; it's okay + public void close() throws IOException { + super.close(); + if (finiteStrings == null) { + //toAutomaton not called yet, so delegate lifecycle + inputTokenStream.close(); + } else { + finiteStrings = null; } + wasReset = false;//reset } /** - * Converts the token stream to an automaton, - * treating the transition labels as utf-8 + * Converts the tokenStream to an automaton, treating the transition labels as utf-8. Closes it. */ public Automaton toAutomaton() throws IOException { return toAutomaton(false); } /** - * Converts the tokenStream to an automaton + * Converts the tokenStream to an automaton. Closes it. */ public Automaton toAutomaton(boolean unicodeAware) throws IOException { // TODO refactor this // maybe we could hook up a modified automaton from TermAutomatonQuery here? - Automaton automaton = null; + Automaton automaton; try { // Create corresponding automaton: labels are bytes // from each analyzed token, with byte 0 used as // separator between tokens: final TokenStreamToAutomaton tsta; if (preserveSep) { - tsta = new EscapingTokenStreamToAutomaton(SEP_CHAR); + tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL); } else { // When we're not preserving sep, we don't steal 0xff // byte, so we don't need to do any escaping: @@ -192,28 +203,29 @@ public Automaton toAutomaton(boolean unicodeAware) throws IOException { tsta.setPreservePositionIncrements(preservePositionIncrements); tsta.setUnicodeArcs(unicodeAware); - automaton = tsta.toAutomaton(input); + automaton = tsta.toAutomaton(inputTokenStream); } finally { - IOUtils.closeWhileHandlingException(input); + IOUtils.closeWhileHandlingException(inputTokenStream); } // TODO: we can optimize this somewhat by determinizing // while we convert - automaton = replaceSep(automaton, preserveSep, SEP_CHAR); + automaton = replaceSep(automaton, preserveSep, SEP_LABEL); // This automaton should not blow up during determinize: return Operations.determinize(automaton, maxGraphExpansions); } /** - * Just escapes the 0xff byte (which we still for SEP). + * Just escapes the {@link #SEP_LABEL} byte with an extra. */ private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { final BytesRefBuilder spare = new BytesRefBuilder(); - private char sepLabel; + final byte sepLabel; - public EscapingTokenStreamToAutomaton(char sepLabel) { - this.sepLabel = sepLabel; + public EscapingTokenStreamToAutomaton(int sepLabel) { + assert sepLabel <= Byte.MAX_VALUE; + this.sepLabel = (byte) sepLabel; } @Override @@ -221,9 +233,9 @@ protected BytesRef changeToken(BytesRef in) { int upto = 0; for (int i = 0; i < in.length; i++) { byte b = in.bytes[in.offset + i]; - if (b == (byte) sepLabel) { + if (b == sepLabel) { spare.grow(upto + 2); - spare.setByteAt(upto++, (byte) sepLabel); + spare.setByteAt(upto++, sepLabel); spare.setByteAt(upto++, b); } else { spare.grow(upto + 1); @@ -260,7 +272,7 @@ private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLab if (t.min == TokenStreamToAutomaton.POS_SEP) { assert t.max == TokenStreamToAutomaton.POS_SEP; if (preserveSep) { - // Remap to SEP_CHAR: + // Remap to SEP_LABEL: result.addTransition(state, t.dest, sepLabel); } else { result.addEpsilon(state, t.dest); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java index d8d103944010..5d8ccbacf3d9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilterFactory.java @@ -1,38 +1,47 @@ /* - This software was produced for the U. S. Government - under Contract No. W15P7T-11-C-F600, and is - subject to the Rights in Noncommercial Computer Software - and Noncommercial Computer Software Documentation - Clause 252.227-7014 (JUN 1995) - - Copyright 2013 The MITRE Corporation. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ - package org.apache.lucene.analysis.miscellaneous; import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; /** * Factory for {@link ConcatenateGraphFilter}. * - *

- * nocommit update
- * 
+ * * @see ConcatenateGraphFilter * @since 7.4.0 */ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 8cb159129ef6..d94b39607bb8 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -72,6 +72,7 @@ import org.apache.lucene.analysis.hunspell.Dictionary; import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter; import org.apache.lucene.analysis.minhash.MinHashFilter; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; @@ -119,10 +120,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { private static final Set> avoidConditionals = new HashSet<>(); static { - // Fingerprint filter needs to consume the whole tokenstream, so conditionals don't make sense here + // These filters needs to consume the whole tokenstream, so conditionals don't make sense here avoidConditionals.add(FingerprintFilter.class); - // Ditto MinHashFilter avoidConditionals.add(MinHashFilter.class); + avoidConditionals.add(ConcatenateGraphFilter.class); } private static final Map,Predicate> brokenConstructors = new HashMap<>(); @@ -156,7 +157,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return !((Boolean) args[2]); // args are broken if consumeAllTokens is false }); for (Class c : Arrays.>asList( - // doesn't actual reset itself! + // doesn't actual reset itself! TODO this statement is probably obsolete as of LUCENE-6121 ? CachingTokenFilter.class, // LUCENE-8092: doesn't handle graph inputs CJKBigramFilter.class, diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java index c4cb0d5aa259..925d5405e2a6 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilter.java @@ -23,12 +23,9 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; @@ -36,28 +33,24 @@ public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase { + private static final char SEP_LABEL = (char) ConcatenateGraphFilter.SEP_LABEL; + @Test public void testBasic() throws Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword"; - BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenStream); - concatStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); - assertTokenStreamContents(stream, new String[] {input}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream); + assertTokenStreamContents(stream, new String[] {input}, null, null, new int[] { 1 }); } @Test public void testWithNoPreserveSep() throws Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword another keyword"; - BytesRef payload = new BytesRef("payload"); tokenStream.setReader(new StringReader(input)); - ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenStream, false, false, 100); - concatStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); - assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new String[] {payload.utf8ToString()}, new int[] { 1 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, false, false, 100); + assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 }); } @Test @@ -65,17 +58,14 @@ public void testWithMultipleTokens() throws Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "mykeyword another keyword"; tokenStream.setReader(new StringReader(input)); - BytesRef payload = new BytesRef("payload"); - ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenStream); - concatStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream); CharsRefBuilder builder = new CharsRefBuilder(); builder.append("mykeyword"); - builder.append((ConcatenateGraphFilter.SEP_CHAR)); + builder.append(SEP_LABEL); builder.append("another"); - builder.append((ConcatenateGraphFilter.SEP_CHAR)); + builder.append(SEP_LABEL); builder.append("keyword"); - assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new String[]{payload.utf8ToString()}, new int[]{1}, null, null); + assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new int[]{1}); } @Test @@ -85,11 +75,8 @@ public void testWithSynonym() throws Exception { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader("mykeyword")); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(filter); - BytesRef payload = new BytesRef("payload"); - concatStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); - assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new String[] {payload.utf8ToString(), payload.utf8ToString()}, new int[] { 1, 0 }, null, null); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter); + assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new int[] { 1, 0 }); } @Test @@ -101,25 +88,23 @@ public void testWithSynonyms() throws Exception { tokenStream.setReader(new StringReader(input)); SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true); BytesRef payload = new BytesRef("payload"); - ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(filter, true, false, 100); - concatStream.setPayload(payload); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); + ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, true, false, 100); String[] expectedOutputs = new String[2]; CharsRefBuilder expectedOutput = new CharsRefBuilder(); expectedOutput.append("mykeyword"); - expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); + expectedOutput.append(SEP_LABEL); expectedOutput.append("another"); - expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); + expectedOutput.append(SEP_LABEL); expectedOutput.append("keyword"); expectedOutputs[0] = expectedOutput.toCharsRef().toString(); expectedOutput.clear(); expectedOutput.append("mysynonym"); - expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); + expectedOutput.append(SEP_LABEL); expectedOutput.append("another"); - expectedOutput.append(ConcatenateGraphFilter.SEP_CHAR); + expectedOutput.append(SEP_LABEL); expectedOutput.append("keyword"); expectedOutputs[1] = expectedOutput.toCharsRef().toString(); - assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null); + assertTokenStreamContents(stream, expectedOutputs, null, null, new int[]{1, 0}); } @Test @@ -132,16 +117,16 @@ public void testWithStopword() throws Exception { ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, true, preservePosInc, 10); CharsRefBuilder builder = new CharsRefBuilder(); if (preservePosInc) { - builder.append((ConcatenateGraphFilter.SEP_CHAR)); + builder.append(SEP_LABEL); } builder.append("mykeyword"); - builder.append((ConcatenateGraphFilter.SEP_CHAR)); + builder.append(SEP_LABEL); if (preservePosInc) { - builder.append((ConcatenateGraphFilter.SEP_CHAR)); + builder.append(SEP_LABEL); } builder.append("keyword"); if (preservePosInc) { - builder.append((ConcatenateGraphFilter.SEP_CHAR)); + builder.append(SEP_LABEL); } assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()}); } @@ -162,18 +147,17 @@ public void testValidNumberOfExpansions() throws IOException { tokenizer.setReader(new StringReader(valueBuilder.toString())); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); - ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(filter); - concatStream.setPayload(new BytesRef()); - PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(concatStream); - stream.reset(); - ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class); - int count = 0; - while(stream.incrementToken()) { - count++; - assertNotNull(attr.getBytesRef()); - assertTrue(attr.getBytesRef().length > 0); + int count; + try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) { + stream.reset(); + ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class); + count = 0; + while (stream.incrementToken()) { + count++; + assertNotNull(attr.getBytesRef()); + assertTrue(attr.getBytesRef().length > 0); + } } - stream.close(); assertEquals(count, 256); } @@ -183,22 +167,4 @@ public void testEmpty() throws IOException { assertTokenStreamContents(filter, new String[0]); } - public final static class PayloadAttrToTypeAttrFilter extends TokenFilter { - private PayloadAttribute payload = addAttribute(PayloadAttribute.class); - private TypeAttribute type = addAttribute(TypeAttribute.class); - - protected PayloadAttrToTypeAttrFilter(TokenStream input) { - super(input); - } - - @Override - public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - // we move them over so we can assert them more easily in the tests - type.setType(payload.getPayload().utf8ToString()); - return true; - } - return false; - } - } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java index 5b3ebacf5495..1e149f03b1ba 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateGraphFilterFactory.java @@ -21,6 +21,7 @@ import java.io.StringReader; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; @@ -33,15 +34,42 @@ public void test() throws Exception { tokenizer.setReader(reader); tokenizer.setEnableChecks(consumeAll); TokenStream stream = tokenizer; - stream = tokenFilterFactory("Concatenate"//nocommit test an arg - ).create(stream); - assertTokenStreamContents(stream, new String[]{input.replace(' ', ConcatenateGraphFilter.SEP_CHAR)}); + stream = tokenFilterFactory("ConcatenateGraph").create(stream); + assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)}); } } + public void testPreserveSep() throws Exception { + final String input = "A1 B2 A1 D4 C3"; + final String output = "A1A1D4C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + TokenStream stream = tokenizer; + stream = new StopFilter(stream, StopFilter.makeStopSet("B2")); + stream = tokenFilterFactory("ConcatenateGraph", + "preserveSep", "false" + ).create(stream); + assertTokenStreamContents(stream, new String[]{output}); + } + + public void testPreservePositionIncrements() throws Exception { + final String input = "A1 B2 A1 D4 C3"; + final String output = "A1 A1 D4 C3"; + Reader reader = new StringReader(input); + MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + tokenizer.setReader(reader); + TokenStream stream = tokenizer; + stream = new StopFilter(stream, StopFilter.makeStopSet("B2")); + stream = tokenFilterFactory("ConcatenateGraph", + "preservePositionIncrements", "false" + ).create(stream); + assertTokenStreamContents(stream, new String[]{output.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)}); + } + public void testRequired() throws Exception { // no params are required - tokenFilterFactory("Concatenate"); + tokenFilterFactory("ConcatenateGraph"); } /** @@ -49,7 +77,7 @@ public void testRequired() throws Exception { */ public void testBogusArguments() throws Exception { IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> - tokenFilterFactory("Concatenate", "bogusArg", "bogusValue")); + tokenFilterFactory("ConcatenateGraph", "bogusArg", "bogusValue")); assertTrue(expected.getMessage().contains("Unknown parameters")); } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java index cdb49de2062f..8888382a5caa 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java @@ -130,7 +130,7 @@ protected Analyzer getWrappedAnalyzer(String fieldName) { @Override protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { - ConcatenateGraphFilter tokenStream = new ConcatenateGraphFilter(components.getTokenStream(), + CompletionTokenStream tokenStream = new CompletionTokenStream(components.getTokenStream(), preserveSep, preservePositionIncrements, maxGraphExpansions); return new TokenStreamComponents(components.getTokenizer(), tokenStream); } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java index 1ed13734d88a..6be0c91117f3 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionQuery.java @@ -27,7 +27,7 @@ import org.apache.lucene.search.suggest.BitsProducer; import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.HOLE_CHARACTER; -import static org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter.SEP_CHAR; +import static org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter.SEP_LABEL; /** * Abstract {@link Query} that match documents containing terms with a specified prefix @@ -158,7 +158,7 @@ private void validate(String termText) { case HOLE_CHARACTER: throw new IllegalArgumentException( "Term text cannot contain HOLE character U+001E; this character is reserved"); - case SEP_CHAR: + case SEP_LABEL: throw new IllegalArgumentException( "Term text cannot contain unit separator character U+001F; this character is reserved"); default: diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java new file mode 100644 index 000000000000..0713f5bc17b9 --- /dev/null +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.suggest.document; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; + +/** + * A {@link ConcatenateGraphFilter} but we can set the payload and provide access to config options. + * @lucene.internal + */ +public final class CompletionTokenStream extends TokenFilter { + // package accessible on purpose + final TokenStream inputTokenStream; + final boolean preserveSep; + final boolean preservePositionIncrements; + final int maxGraphExpansions; + + private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); + + private BytesRef payload; // note doesn't participate in TokenStream lifecycle; it's effectively constant + + CompletionTokenStream(TokenStream inputTokenStream) { + this(inputTokenStream, + ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP, + ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS, + ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS); + } + + CompletionTokenStream(TokenStream inputTokenStream, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) { + super(new ConcatenateGraphFilter(inputTokenStream, preserveSep, preservePositionIncrements, maxGraphExpansions)); + this.inputTokenStream = inputTokenStream; + this.preserveSep = preserveSep; + this.preservePositionIncrements = preservePositionIncrements; + this.maxGraphExpansions = maxGraphExpansions; + } + + public void setPayload(BytesRef payload) { + this.payload = payload; + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + payloadAtt.setPayload(payload); + return true; + } else { + return false; + } + } + + /** @see ConcatenateGraphFilter#toAutomaton() */ + public Automaton toAutomaton() throws IOException { + return ((ConcatenateGraphFilter)input).toAutomaton(); + } + + /** @see ConcatenateGraphFilter#toAutomaton(boolean) */ + public Automaton toAutomaton(boolean unicodeAware) throws IOException { + return ((ConcatenateGraphFilter)input).toAutomaton(unicodeAware); + } +} diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java index 5d56795c4b84..1a2680cb553e 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java @@ -176,10 +176,10 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo return new CompletionWeight(this, innerAutomaton); } - // if separators are preserved the fst contains a SEP_CHAR + // if separators are preserved the fst contains a SEP_LABEL // behind each gap. To have a matching automaton, we need to - // include the SEP_CHAR in the query as well - Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_CHAR)); + // include the SEP_LABEL in the query as well + Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL)); Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton); Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton); contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES); @@ -303,9 +303,9 @@ private void setInnerWeight(IntsRef ref, int offset) { } ref.offset = ++i; assert ref.offset < ref.length : "input should not end with the context separator"; - if (ref.ints[i] == ConcatenateGraphFilter.SEP_CHAR) { + if (ref.ints[i] == ConcatenateGraphFilter.SEP_LABEL) { ref.offset++; - assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_CHAR"; + assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_LABEL"; } ref.length = ref.length - ref.offset; refBuilder.copyInts(ref.ints, ref.offset, ref.length); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java index 84a99c07c658..d608339a1f8b 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextSuggestField.java @@ -24,7 +24,6 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -84,22 +83,22 @@ protected Iterable contexts() { } @Override - protected ConcatenateGraphFilter wrapTokenStream(TokenStream stream) { + protected CompletionTokenStream wrapTokenStream(TokenStream stream) { final Iterable contexts = contexts(); for (CharSequence context : contexts) { validate(context); } - ConcatenateGraphFilter concatStream; - if (stream instanceof ConcatenateGraphFilter) { - //nocommit this is awkward; is there a better way avoiding re-creating the chain? - concatStream = (ConcatenateGraphFilter) stream; - PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(concatStream.getInput(), (char) CONTEXT_SEPARATOR, contexts); - concatStream = new ConcatenateGraphFilter(prefixTokenFilter, + CompletionTokenStream concatStream; + if (stream instanceof CompletionTokenStream) { + //TODO this is awkward; is there a better way avoiding re-creating the chain? + concatStream = (CompletionTokenStream) stream; + PrefixTokenFilter prefixTokenFilter = new PrefixTokenFilter(concatStream.inputTokenStream, (char) CONTEXT_SEPARATOR, contexts); + concatStream = new CompletionTokenStream(prefixTokenFilter, concatStream.preserveSep, concatStream.preservePositionIncrements, concatStream.maxGraphExpansions); } else { - concatStream = new ConcatenateGraphFilter(new PrefixTokenFilter(stream, (char) CONTEXT_SEPARATOR, contexts)); + concatStream = new CompletionTokenStream(new PrefixTokenFilter(stream, (char) CONTEXT_SEPARATOR, contexts)); } return concatStream; } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java index b5754e8d765c..b243f4ede83d 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java @@ -23,7 +23,6 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; @@ -145,7 +144,7 @@ public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter, i @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - ConcatenateGraphFilter stream = (ConcatenateGraphFilter) analyzer.tokenStream(getField(), getTerm().text()); + CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); Set refs = new HashSet<>(); Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs); if (unicodeAware) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java index 1dd2b1f7ae49..5ca4993396fa 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java @@ -43,10 +43,7 @@ final class NRTSuggesterBuilder { * Label used to separate surface form and docID * in the output */ - public static final int PAYLOAD_SEP = '\u001F'; - static { - assert PAYLOAD_SEP == ConcatenateGraphFilter.SEP_CHAR; - } + public static final int PAYLOAD_SEP = ConcatenateGraphFilter.SEP_LABEL; /** * Marks end of the analyzed input and start of dedup diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java index 2cffc761f69f..7bb75e9261ca 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java @@ -19,7 +19,6 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; @@ -69,7 +68,7 @@ public PrefixCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - ConcatenateGraphFilter stream = (ConcatenateGraphFilter) analyzer.tokenStream(getField(), getTerm().text()); + CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); return new CompletionWeight(this, stream.toAutomaton()); } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java index 0ee338f4e8c2..ce8c2e94cd13 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java @@ -101,7 +101,7 @@ public SuggestField(String name, String value, int weight) { @Override public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { - ConcatenateGraphFilter completionStream = wrapTokenStream(super.tokenStream(analyzer, reuse)); + CompletionTokenStream completionStream = wrapTokenStream(super.tokenStream(analyzer, reuse)); completionStream.setPayload(buildSuggestPayload()); return completionStream; } @@ -111,11 +111,11 @@ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { * * Subclasses can override this method to change the indexing pipeline. */ - protected ConcatenateGraphFilter wrapTokenStream(TokenStream stream) { - if (stream instanceof ConcatenateGraphFilter) { - return (ConcatenateGraphFilter) stream; + protected CompletionTokenStream wrapTokenStream(TokenStream stream) { + if (stream instanceof CompletionTokenStream) { + return (CompletionTokenStream) stream; } else { - return new ConcatenateGraphFilter(stream); + return new CompletionTokenStream(stream); } } @@ -141,7 +141,7 @@ private BytesRef buildSuggestPayload() { private boolean isReserved(char c) { switch (c) { - case ConcatenateGraphFilter.SEP_CHAR: + case ConcatenateGraphFilter.SEP_LABEL: case CompletionAnalyzer.HOLE_CHARACTER: case NRTSuggesterBuilder.END_BYTE: return true; diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java index 23b1267a19a1..8beea1296227 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextSuggestField.java @@ -110,13 +110,13 @@ public void testTokenStream() throws Exception { CharsRefBuilder builder = new CharsRefBuilder(); builder.append("context1"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(ConcatenateGraphFilter.SEP_CHAR); + builder.append((char) ConcatenateGraphFilter.SEP_LABEL); builder.append("input"); expectedOutputs[0] = builder.toCharsRef().toString(); builder.clear(); builder.append("context2"); builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR)); - builder.append(ConcatenateGraphFilter.SEP_CHAR); + builder.append((char) ConcatenateGraphFilter.SEP_LABEL); builder.append("input"); expectedOutputs[1] = builder.toCharsRef().toString(); TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null)); diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index a083535917ac..e6d7062c925b 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -103,7 +103,7 @@ public void testNegativeWeight() throws Exception { public void testReservedChars() throws Exception { CharsRefBuilder charsRefBuilder = new CharsRefBuilder(); charsRefBuilder.append("sugg"); - charsRefBuilder.setCharAt(2, ConcatenateGraphFilter.SEP_CHAR); + charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL); IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { new SuggestField("name", charsRefBuilder.toString(), 1); }); From ac301b4075e494070d5851e358b4626b69e6439f Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 1 Jun 2018 00:24:23 -0400 Subject: [PATCH 3/3] Made comply more with TokenStream contract * made it cranky if close not called. toAutomaton no longer closes. * added proper end offset tracking. --- .../miscellaneous/ConcatenateGraphFilter.java | 61 +++++++++---------- .../document/FuzzyCompletionQuery.java | 7 ++- .../document/PrefixCompletionQuery.java | 5 +- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java index b90ed45260a0..eb16c36866eb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateGraphFilter.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.util.AttributeImpl; @@ -28,7 +29,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; @@ -66,6 +66,7 @@ public final class ConcatenateGraphFilter extends TokenStream { private final BytesRefBuilderTermAttribute bytesAtt = addAttribute(BytesRefBuilderTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final TokenStream inputTokenStream; private final boolean preserveSep; @@ -75,6 +76,7 @@ public final class ConcatenateGraphFilter extends TokenStream { private LimitedFiniteStringsIterator finiteStrings; private CharTermAttribute charTermAttribute; private boolean wasReset = false; + private int endOffset; /** * Creates a token stream to convert input to a token stream @@ -128,8 +130,10 @@ public boolean incrementToken() throws IOException { throw new IllegalStateException("reset() missing before incrementToken"); } // lazy init/consume - Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, end(), and close() on inputTokenStream + Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); + //note: would be nice to know the startOffset but toAutomaton doesn't capture it. We'll assume 0 + endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset(); } IntsRef string = finiteStrings.next(); @@ -143,6 +147,8 @@ public boolean incrementToken() throws IOException { posIncrAtt.setPositionIncrement(0); // stacked } + offsetAtt.setOffset(0, endOffset); + Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8 if (charTermAttribute != null) { charTermAttribute.setLength(0); @@ -155,58 +161,47 @@ public boolean incrementToken() throws IOException { @Override public void end() throws IOException { super.end(); - //nocommit I think we don't need to delegate this under any condition - if (finiteStrings == null) { - //toAutomaton not called yet, so delegate lifecycle - inputTokenStream.end(); - } + offsetAtt.setOffset(0, endOffset); } @Override public void close() throws IOException { super.close(); - if (finiteStrings == null) { - //toAutomaton not called yet, so delegate lifecycle - inputTokenStream.close(); - } else { - finiteStrings = null; - } + //delegate lifecycle. Note toAutomaton does not close the stream + inputTokenStream.close(); wasReset = false;//reset + endOffset = -1;//reset } /** - * Converts the tokenStream to an automaton, treating the transition labels as utf-8. Closes it. + * Converts the tokenStream to an automaton, treating the transition labels as utf-8. Does *not* close it. */ public Automaton toAutomaton() throws IOException { return toAutomaton(false); } /** - * Converts the tokenStream to an automaton. Closes it. + * Converts the tokenStream to an automaton. Does *not* close it. */ public Automaton toAutomaton(boolean unicodeAware) throws IOException { // TODO refactor this // maybe we could hook up a modified automaton from TermAutomatonQuery here? - Automaton automaton; - try { - // Create corresponding automaton: labels are bytes - // from each analyzed token, with byte 0 used as - // separator between tokens: - final TokenStreamToAutomaton tsta; - if (preserveSep) { - tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL); - } else { - // When we're not preserving sep, we don't steal 0xff - // byte, so we don't need to do any escaping: - tsta = new TokenStreamToAutomaton(); - } - tsta.setPreservePositionIncrements(preservePositionIncrements); - tsta.setUnicodeArcs(unicodeAware); - automaton = tsta.toAutomaton(inputTokenStream); - } finally { - IOUtils.closeWhileHandlingException(inputTokenStream); + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + final TokenStreamToAutomaton tsta; + if (preserveSep) { + tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL); + } else { + // When we're not preserving sep, we don't steal 0xff + // byte, so we don't need to do any escaping: + tsta = new TokenStreamToAutomaton(); } + tsta.setPreservePositionIncrements(preservePositionIncrements); + tsta.setUnicodeArcs(unicodeAware); + + Automaton automaton = tsta.toAutomaton(inputTokenStream); // TODO: we can optimize this somewhat by determinizing // while we convert diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java index b243f4ede83d..14479fecd123 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java @@ -144,9 +144,12 @@ public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter, i @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); + final Automaton originalAutomata; + try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()) ) { + originalAutomata = stream.toAutomaton(unicodeAware); + } Set refs = new HashSet<>(); - Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs); + Automaton automaton = toLevenshteinAutomata(originalAutomata, refs); if (unicodeAware) { Automaton utf8automaton = new UTF32ToUTF8().convert(automaton); utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java index 7bb75e9261ca..a8da150f5045 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/PrefixCompletionQuery.java @@ -68,8 +68,9 @@ public PrefixCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()); - return new CompletionWeight(this, stream.toAutomaton()); + try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) { + return new CompletionWeight(this, stream.toAutomaton()); + } } /**