From 5711d8d477352f440e2a0259366a856ec1db6b30 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Sun, 22 Apr 2018 20:41:08 +0000 Subject: [PATCH] LUCENE-8265: WordDelimiter*Filter ignores keywords --- .../miscellaneous/WordDelimiterFilter.java | 13 +++++- .../WordDelimiterGraphFilter.java | 18 ++++++-- .../TestWordDelimiterFilter.java | 43 ++++++++++++++----- .../TestWordDelimiterGraphFilter.java | 32 ++++++++++++++ 4 files changed, 90 insertions(+), 16 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index 313386bb523b..16edb3dbef73 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; @@ -164,7 +165,12 @@ public final class WordDelimiterFilter extends TokenFilter { * "O'Neil's" => "O", "Neil" */ public static final int STEM_ENGLISH_POSSESSIVE = 256; - + + /** + * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true. 
+ */ + public static final int IGNORE_KEYWORDS = 512; + /** * If not null is the set of tokens to protect from being delimited * @@ -174,6 +180,7 @@ public final class WordDelimiterFilter extends TokenFilter { private final int flags; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); @@ -243,7 +250,9 @@ public boolean incrementToken() throws IOException { if (!input.incrementToken()) { return false; } - + if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) { + return true; + } int termLength = termAttribute.length(); char[] termBuffer = termAttribute.buffer(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index 7949fa2b1829..7d021c5a71c5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; @@ -39,7 +40,7 @@ * work correctly when this 
filter is used in the search-time analyzer. Unlike * the deprecated {@link WordDelimiterFilter}, this token filter produces a * correct token graph as output. However, it cannot consume an input token - * graph correctly. + * graph correctly. Processing is suppressed by {@link KeywordAttribute#isKeyword()}=true. * *

 * Words are split into subwords with the following rules: @@ -156,7 +157,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter { * "O'Neil's" => "O", "Neil" */ public static final int STEM_ENGLISH_POSSESSIVE = 256; - + + /** + * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true. + */ + public static final int IGNORE_KEYWORDS = 512; + /** * If not null is the set of tokens to protect from being delimited * @@ -174,6 +180,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter { private char[][] bufferedTermParts = new char[4][]; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class); @@ -225,7 +232,8 @@ public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int config PRESERVE_ORIGINAL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | - STEM_ENGLISH_POSSESSIVE)) != 0) { + STEM_ENGLISH_POSSESSIVE | + IGNORE_KEYWORDS)) != 0) { throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags); } this.flags = configurationFlags; @@ -335,7 +343,9 @@ public boolean incrementToken() throws IOException { if (input.incrementToken() == false) { return false; } - + if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) { + return true; + } int termLength = termAttribute.length(); char[] termBuffer = termAttribute.buffer(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java index 2804bfd5a07e..f945cd6d39db 100644 
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java @@ -27,7 +27,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.IOUtils; -import org.junit.Test; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; @@ -57,7 +56,6 @@ public void testPerformance() throws IOException { } ***/ - @Test public void testOffsets() throws IOException { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; // test that subwords and catenated subwords have @@ -77,7 +75,6 @@ public void testOffsets() throws IOException { new int[] { 6, 6, 6 }); } - @Test public void testOffsetChange() throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -88,7 +85,6 @@ public void testOffsetChange() throws Exception { new int[] { 15 }); } - @Test public void testOffsetChange2() throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -99,7 +95,6 @@ public void testOffsetChange2() throws Exception { new int[] { 17 }); } - @Test public void testOffsetChange3() throws Exception { int flags = GENERATE_WORD_PARTS | 
GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -110,7 +105,6 @@ public void testOffsetChange3() throws Exception { new int[] { 16 }); } - @Test public void testOffsetChange4() throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null); @@ -129,7 +123,6 @@ public void doSplit(final String input, String... output) throws Exception { assertTokenStreamContents(wdf, output); } - @Test public void testSplits() throws Exception { doSplit("basic-split","basic","split"); doSplit("camelCase","camel","Case"); @@ -175,7 +168,6 @@ public void doSplitPossessive(int stemPossessive, final String input, final Stri /* * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. 
*/ - @Test public void testPossessives() throws Exception { doSplitPossessive(1, "ra's", "ra"); doSplitPossessive(0, "ra's", "ra", "s"); @@ -204,7 +196,6 @@ public boolean incrementToken() throws IOException { } } - @Test public void testPositionIncrements() throws Exception { final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false); @@ -323,6 +314,38 @@ public TokenStreamComponents createComponents(String field) { IOUtils.close(a, a2, a3); } + public void testKeywordFilter() throws Exception { + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm", "nop", "kpop"}); + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm-nop", "kpop"}, + new int[]{0, 4, 8, 16}, + new int[]{3, 7, 15, 20}, + null, + new int[]{1, 1, 1, 1}, + null, + false); + } + + private Analyzer keywordTestAnalyzer(int flags) throws Exception { + return new Analyzer() { + @Override + public TokenStreamComponents createComponents(String field) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) { + private final CharTermAttribute term = addAttribute(CharTermAttribute.class); + @Override public boolean isKeyword() { + // Marks terms starting with the letter 'k' as keywords + return term.toString().charAt(0) == 'k'; + } + }; + return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null)); + } + }; + } + /** concat numbers + words + all */ public void testLotsOfConcatenating() throws Exception { final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | 
STEM_ENGLISH_POSSESSIVE; @@ -346,7 +369,7 @@ public TokenStreamComponents createComponents(String field) { false); a.close(); } - + /** concat numbers + words + all + preserve original */ public void testLotsOfConcatenating2() throws Exception { final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java index 7516a23fd65c..61ae6c07c4b3 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java @@ -309,6 +309,38 @@ public TokenStreamComponents createComponents(String field) { IOUtils.close(a, a2, a3); } + public void testKeywordFilter() throws Exception { + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm", "nop", "kpop"}); + assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS), + "abc-def klm-nop kpop", + new String[] {"abc", "def", "klm-nop", "kpop"}, + new int[]{0, 4, 8, 16}, + new int[]{3, 7, 15, 20}, + null, + new int[]{1, 1, 1, 1}, + null, + false); + } + + private Analyzer keywordTestAnalyzer(int flags) throws Exception { + return new Analyzer() { + @Override + public TokenStreamComponents createComponents(String field) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) { + private final CharTermAttribute term = addAttribute(CharTermAttribute.class); + @Override public boolean isKeyword() { + // Marks terms starting with the letter 'k' as 
keywords + return term.toString().charAt(0) == 'k'; + } + }; + return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(kFilter, flags, null)); + } + }; + } + /** concat numbers + words + all */ public void testLotsOfConcatenating() throws Exception { final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;