From 5711d8d477352f440e2a0259366a856ec1db6b30 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Sun, 22 Apr 2018 20:41:08 +0000 Subject: [PATCH] LUCENE-8265: WordDelimiter*Filter ignores keywords --- .../miscellaneous/WordDelimiterFilter.java | 13 +++++- .../WordDelimiterGraphFilter.java | 18 ++++++-- .../TestWordDelimiterFilter.java | 43 ++++++++++++++----- .../TestWordDelimiterGraphFilter.java | 32 ++++++++++++++ 4 files changed, 90 insertions(+), 16 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index 313386bb523b..16edb3dbef73 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; @@ -164,7 +165,12 @@ public final class WordDelimiterFilter extends TokenFilter { * "O'Neil's" => "O", "Neil" */ public static final int STEM_ENGLISH_POSSESSIVE = 256; - + + /** + * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true. 
+ */ + public static final int IGNORE_KEYWORDS = 512; + /** * If not null is the set of tokens to protect from being delimited * @@ -174,6 +180,7 @@ public final class WordDelimiterFilter extends TokenFilter { private final int flags; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); @@ -243,7 +250,9 @@ public boolean incrementToken() throws IOException { if (!input.incrementToken()) { return false; } - + if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) { + return true; + } int termLength = termAttribute.length(); char[] termBuffer = termAttribute.buffer(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index 7949fa2b1829..7d021c5a71c5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; @@ -39,7 +40,7 @@ * work correctly when this 
filter is used in the search-time analyzer. Unlike * the deprecated {@link WordDelimiterFilter}, this token filter produces a * correct token graph as output. However, it cannot consume an input token - * graph correctly. + * graph correctly. Processing is suppressed by {@link KeywordAttribute#isKeyword()}=true. * *

 * Words are split into subwords with the following rules: @@ -156,7 +157,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter { * "O'Neil's" => "O", "Neil" */ public static final int STEM_ENGLISH_POSSESSIVE = 256; - + + /** + * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true. + */ + public static final int IGNORE_KEYWORDS = 512; + /** * If not null is the set of tokens to protect from being delimited * @@ -174,6 +180,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter { private char[][] bufferedTermParts = new char[4][]; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class); @@ -225,7 +232,8 @@ public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int config PRESERVE_ORIGINAL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | - STEM_ENGLISH_POSSESSIVE)) != 0) { + STEM_ENGLISH_POSSESSIVE | + IGNORE_KEYWORDS)) != 0) { throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags); } this.flags = configurationFlags; @@ -335,7 +343,9 @@ public boolean incrementToken() throws IOException { if (input.incrementToken() == false) { return false; } - + if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) { + return true; + } int termLength = termAttribute.length(); char[] termBuffer = termAttribute.buffer(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java index 2804bfd5a07e..f945cd6d39db 100644 
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java @@ -27,7 +27,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.IOUtils; -import org.junit.Test; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; @@ -57,7 +56,6 @@ public void testPerformance() throws IOException { } ***/ - @Test public void testOffsets() throws IOException { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; // test that subwords and catenated subwords have @@ -77,7 +75,6 @@ public void testOffsets() throws IOException { new int[] { 6, 6, 6 }); } - @Test public void testOffsetChange() throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -88,7 +85,6 @@ public void testOffsetChange() throws Exception { new int[] { 15 }); } - @Test public void testOffsetChange2() throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -99,7 +95,6 @@ public void testOffsetChange2() throws Exception { new int[] { 17 }); } - @Test public void testOffsetChange3() throws Exception { int flags = GENERATE_WORD_PARTS | 
GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -110,7 +105,6 @@ public void testOffsetChange3() throws Exception { new int[] { 16 }); } - @Test public void testOffsetChange4() throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -129,7 +123,6 @@ public void doSplit(final String input, String... output) throws Exception { assertTokenStreamContents(wdf, output); } - @Test public void testSplits() throws Exception { doSplit("basic-split","basic","split"); doSplit("camelCase","camel","Case"); @@ -175,7 +168,6 @@ public void doSplitPossessive(int stemPossessive, final String input, final Stri /* * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. 
*/ - @Test public void testPossessives() throws Exception { doSplitPossessive(1, "ra's", "ra"); doSplitPossessive(0, "ra's", "ra", "s"); @@ -204,7 +196,6 @@ public boolean incrementToken() throws IOException { } } - @Test public void testPositionIncrements() throws Exception { final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false); @@ -323,6 +314,38 @@ public TokenStreamComponents createComponents(String field) { IOUtils.close(a, a2, a3); } + public void testKeywordFilter() throws Exception { + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm", "nop", "kpop"}); + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm-nop", "kpop"}, + new int[]{0, 4, 8, 16}, + new int[]{3, 7, 15, 20}, + null, + new int[]{1, 1, 1, 1}, + null, + false); + } + + private Analyzer keywordTestAnalyzer(int flags) throws Exception { + return new Analyzer() { + @Override + public TokenStreamComponents createComponents(String field) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) { + private final CharTermAttribute term = addAttribute(CharTermAttribute.class); + @Override public boolean isKeyword() { + // Marks terms starting with the letter 'k' as keywords + return term.toString().charAt(0) == 'k'; + } + }; + return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null)); + } + }; + } + /** concat numbers + words + all */ public void testLotsOfConcatenating() throws Exception { final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | 
STEM_ENGLISH_POSSESSIVE; @@ -346,7 +369,7 @@ public TokenStreamComponents createComponents(String field) { false); a.close(); } - + /** concat numbers + words + all + preserve original */ public void testLotsOfConcatenating2() throws Exception { final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java index 7516a23fd65c..61ae6c07c4b3 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java @@ -309,6 +309,38 @@ public TokenStreamComponents createComponents(String field) { IOUtils.close(a, a2, a3); } + public void testKeywordFilter() throws Exception { + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm", "nop", "kpop"}); + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm-nop", "kpop"}, + new int[]{0, 4, 8, 16}, + new int[]{3, 7, 15, 20}, + null, + new int[]{1, 1, 1, 1}, + null, + false); + } + + private Analyzer keywordTestAnalyzer(int flags) throws Exception { + return new Analyzer() { + @Override + public TokenStreamComponents createComponents(String field) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) { + private final CharTermAttribute term = addAttribute(CharTermAttribute.class); + @Override public boolean isKeyword() { + // Marks terms starting with the letter 'k' as 
keywords + return term.toString().charAt(0) == 'k'; + } + }; + return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(kFilter, flags, null)); + } + }; + } + /** concat numbers + words + all */ public void testLotsOfConcatenating() throws Exception { final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;