From ca80f34ec0c60a8a01a687f670b25283982e75be Mon Sep 17 00:00:00 2001
From: Ingomar Wesp <ingomar@wesp.name>
Date: Fri, 30 Mar 2018 23:40:17 +0200
Subject: [PATCH] N-Gram filters: Add options to keep original terms.

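Adds the following properties to EdgeNGramTokenFilter and
NGramTokenFilter. Both are also exposed as boolean arguments of
EdgeNGramFilterFactory and NGramFilterFactory, and both default to false:
- keepShortTerm: Don't drop input terms shorter than minGramSize (measured
  in code points); pass them through unchanged instead.
- keepLongTerm: Don't drop input terms longer than maxGramSize (measured
  in code points); emit the original term after its n-grams, with a
  position increment of 0.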
---
 .../ngram/EdgeNGramFilterFactory.java         |   6 +-
 .../analysis/ngram/EdgeNGramTokenFilter.java  |  77 +++++++----
 .../analysis/ngram/NGramFilterFactory.java    |   6 +-
 .../analysis/ngram/NGramTokenFilter.java      |  87 ++++++++-----
 .../ngram/EdgeNGramTokenFilterTest.java       | 120 ++++++++++++++----
 .../analysis/ngram/NGramTokenFilterTest.java  | 120 +++++++++++++++---
 6 files changed, 316 insertions(+), 100 deletions(-)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java
index 020b85bb5e92..6bc830a3ca4e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java
@@ -36,12 +36,16 @@
 public class EdgeNGramFilterFactory extends TokenFilterFactory {
   private final int maxGramSize;
   private final int minGramSize;
+  private final boolean keepShortTerm;
+  private final boolean keepLongTerm;
 
   /** Creates a new EdgeNGramFilterFactory */
   public EdgeNGramFilterFactory(Map<String, String> args) {
     super(args);
     minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
     maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+    keepShortTerm = getBoolean(args, "keepShortTerm", EdgeNGramTokenFilter.DEFAULT_KEEP_SHORT_TERM);
+    keepLongTerm = getBoolean(args, "keepLongTerm", EdgeNGramTokenFilter.DEFAULT_KEEP_LONG_TERM);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -49,6 +53,6 @@ public EdgeNGramFilterFactory(Map<String, String> args) {
 
   @Override
   public TokenFilter create(TokenStream input) {
-    return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
+    return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, keepShortTerm, keepLongTerm);
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 56efd897d178..08013da99f88 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -34,27 +34,35 @@
 public final class EdgeNGramTokenFilter extends TokenFilter {
   public static final int DEFAULT_MAX_GRAM_SIZE = 1;
   public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+  public static final boolean DEFAULT_KEEP_SHORT_TERM = false;
+  public static final boolean DEFAULT_KEEP_LONG_TERM = false;
 
   private final int minGram;
   private final int maxGram;
+  private final boolean keepShortTerm;
+  private final boolean keepLongTerm;
+
   private char[] curTermBuffer;
   private int curTermLength;
-  private int curCodePointCount;
+  private int curTermCodePointCount;
   private int curGramSize;
-  private int savePosIncr;
+  private int curPosIncr;
   private State state;
   
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final CharTermAttribute termAtt;
+  private final PositionIncrementAttribute posIncrAtt;
 
   /**
-   * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+   * Creates EdgeNGramTokenFilter that generates edge n-grams of sizes in the given range.
    *
    * @param input {@link TokenStream} holding the input to be tokenized
    * @param minGram the smallest n-gram to generate
    * @param maxGram the largest n-gram to generate
+   * @param keepShortTerm whether to pass through tokens that are shorter than minGram
+   * @param keepLongTerm whether to pass through tokens that are longer than maxGram
    */
-  public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
+  public EdgeNGramTokenFilter(
+      TokenStream input, int minGram, int maxGram, boolean keepShortTerm, boolean keepLongTerm) {
     super(input);
 
     if (minGram < 1) {
@@ -67,6 +75,15 @@ public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
 
     this.minGram = minGram;
     this.maxGram = maxGram;
+    this.keepShortTerm = keepShortTerm;
+    this.keepLongTerm = keepLongTerm;
+    
+    this.termAtt = addAttribute(CharTermAttribute.class);
+    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  }
+
+  public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
+    this(input, minGram, maxGram, DEFAULT_KEEP_SHORT_TERM, DEFAULT_KEEP_LONG_TERM);
   }
 
   @Override
@@ -75,32 +92,46 @@ public final boolean incrementToken() throws IOException {
       if (curTermBuffer == null) {
         if (!input.incrementToken()) {
           return false;
-        } else {
-          curTermBuffer = termAtt.buffer().clone();
-          curTermLength = termAtt.length();
-          curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
-          curGramSize = minGram;
-          state = captureState();
-          savePosIncr += posIncrAtt.getPositionIncrement();
         }
+        state = captureState();
+        
+        curTermLength = termAtt.length();
+        curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength);
+        curPosIncr += posIncrAtt.getPositionIncrement();
+
+        if (keepShortTerm && curTermCodePointCount < minGram) {
+          // Token is shorter than minGram, but we'd still like to keep it.
+          posIncrAtt.setPositionIncrement(curPosIncr);
+          curPosIncr = 0;
+          return true;
+        }
+        
+        curTermBuffer = termAtt.buffer().clone();
+        curGramSize = minGram;
       }
-      if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
-        if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
-          // grab gramSize chars from front or back
+
+      if (curGramSize <= curTermCodePointCount) {
+        if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram
           restoreState(state);
           // first ngram gets increment, others don't
-          if (curGramSize == minGram) {
-            posIncrAtt.setPositionIncrement(savePosIncr);
-            savePosIncr = 0;
-          } else {
-            posIncrAtt.setPositionIncrement(0);
-          }
+          posIncrAtt.setPositionIncrement(curPosIncr);
+          curPosIncr = 0;
+
           final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
           termAtt.copyBuffer(curTermBuffer, 0, charLength);
           curGramSize++;
           return true;
         }
+        else if (keepLongTerm) {
+          // Token is longer than maxGram, but we'd still like to keep it.
+          restoreState(state);
+          posIncrAtt.setPositionIncrement(0);
+          termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
+          curTermBuffer = null;
+          return true;
+        }
       }
+      // Done with this input token, get next token on the next iteration.
       curTermBuffer = null;
     }
   }
@@ -109,6 +140,6 @@ public final boolean incrementToken() throws IOException {
   public void reset() throws IOException {
     super.reset();
     curTermBuffer = null;
-    savePosIncr = 0;
+    curPosIncr = 0;
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java
index 2064716b78b7..60165be8b6ad 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java
@@ -36,12 +36,16 @@
 public class NGramFilterFactory extends TokenFilterFactory {
   private final int maxGramSize;
   private final int minGramSize;
+  private final boolean keepShortTerm;
+  private final boolean keepLongTerm;
 
   /** Creates a new NGramFilterFactory */
   public NGramFilterFactory(Map<String, String> args) {
     super(args);
     minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
     maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
+    keepShortTerm = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_KEEP_SHORT_TERM);
+    keepLongTerm = getBoolean(args, "keepLongTerm", NGramTokenFilter.DEFAULT_KEEP_LONG_TERM);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -49,6 +53,6 @@ public NGramFilterFactory(Map<String, String> args) {
 
   @Override
   public TokenFilter create(TokenStream input) {
-    return new NGramTokenFilter(input, minGramSize, maxGramSize);
+    return new NGramTokenFilter(input, minGramSize, maxGramSize, keepShortTerm, keepLongTerm);
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index a2e0aa7e5884..fb21e30d16f2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -21,7 +21,6 @@
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
@@ -42,28 +41,27 @@
 public final class NGramTokenFilter extends TokenFilter {
   public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
   public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
+  public static final boolean DEFAULT_KEEP_SHORT_TERM = false;
+  public static final boolean DEFAULT_KEEP_LONG_TERM = false;
 
-  private final int minGram, maxGram;
+  private final int minGram;
+  private final int maxGram;
+  private final boolean keepShortTerm;
+  private final boolean keepLongTerm;
 
   private char[] curTermBuffer;
   private int curTermLength;
-  private int curCodePointCount;
+  private int curTermCodePointCount;
   private int curGramSize;
   private int curPos;
-  private int curPosInc;
+  private int curPosIncr;
   private State state;
 
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncAtt;
+  private final CharTermAttribute termAtt;
+  private final PositionIncrementAttribute posIncrAtt;
 
-  /**
-   * Creates NGramTokenFilter with given min and max n-grams.
-   * @param input {@link TokenStream} holding the input to be tokenized
-   * @param minGram the smallest n-gram to generate
-   * @param maxGram the largest n-gram to generate
-   */
-  public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
-    super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
+  public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean keepShortTerm, boolean keepLongTerm) {
+    super(input);
     if (minGram < 1) {
       throw new IllegalArgumentException("minGram must be greater than zero");
     }
@@ -72,8 +70,21 @@ public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
+    this.keepShortTerm = keepShortTerm;
+    this.keepLongTerm = keepLongTerm;
 
-    posIncAtt = addAttribute(PositionIncrementAttribute.class);
+    this.termAtt = addAttribute(CharTermAttribute.class);
+    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  }
+  
+  /**
+   * Creates NGramTokenFilter with given min and max n-grams.
+   * @param input {@link TokenStream} holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
+    this(input, minGram, maxGram, DEFAULT_KEEP_SHORT_TERM, DEFAULT_KEEP_LONG_TERM);
   }
 
   /**
@@ -84,39 +95,56 @@ public NGramTokenFilter(TokenStream input) {
     this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
   }
 
-  /** Returns the next token in the stream, or null at EOS. */
   @Override
   public final boolean incrementToken() throws IOException {
     while (true) {
       if (curTermBuffer == null) {
         if (!input.incrementToken()) {
           return false;
-        } else {
-          curTermBuffer = termAtt.buffer().clone();
-          curTermLength = termAtt.length();
-          curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
-          curGramSize = minGram;
-          curPos = 0;
-          curPosInc = posIncAtt.getPositionIncrement();
-          state = captureState();
         }
+        state = captureState();
+        
+        curTermLength = termAtt.length();
+        curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
+        curPosIncr += posIncrAtt.getPositionIncrement();
+        curPos = 0;
+        
+        if (keepShortTerm && curTermCodePointCount < minGram) {
+          // Token is shorter than minGram, but we'd still like to keep it.
+          posIncrAtt.setPositionIncrement(curPosIncr);
+          curPosIncr = 0;
+          return true;
+        }
+        
+        curTermBuffer = termAtt.buffer().clone();
+        curGramSize = minGram;
       }
 
-      if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
+      if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) {
         ++curPos;
         curGramSize = minGram;
       }
-      if ((curPos + curGramSize) <= curCodePointCount) {
+      if ((curPos + curGramSize) <= curTermCodePointCount) {
         restoreState(state);
         final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
         final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
         termAtt.copyBuffer(curTermBuffer, start, end - start);
-        posIncAtt.setPositionIncrement(curPosInc);
-        curPosInc = 0;
+        posIncrAtt.setPositionIncrement(curPosIncr);
+        curPosIncr = 0;
         curGramSize++;
         return true;
       }
-      curTermBuffer = null;
+      else if (keepLongTerm && curTermCodePointCount > maxGram) {
+        // Token is longer than maxGram, but we'd still like to keep it.
+        restoreState(state);
+        posIncrAtt.setPositionIncrement(0);
+        termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
+        curTermBuffer = null;
+        return true;
+      }
+      
+      // Done with this input token, get next token on next iteration.
+      curTermBuffer = null;  
     }
   }
 
@@ -124,5 +152,6 @@ public final boolean incrementToken() throws IOException {
   public void reset() throws IOException {
     super.reset();
     curTermBuffer = null;
+    curPosIncr = 0;
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
index d7536e7050f3..b4bf6b33ea82 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
@@ -76,6 +76,55 @@ public void testOversizedNgrams() throws Exception {
     assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
   }
 
+  public void testOversizedNgramsKeepShortTerm() throws Exception {
+    EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true, false);
+    assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
+  }
+
+  public void testKeepShortTermKeepLongTerm() throws Exception {
+    final String inputString = "a bcd efghi jk";
+
+    { // default behaviour
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3);
+      assertTokenStreamContents(filter,
+          new String[] { "bc", "bcd",  "ef", "efg",  "jk" },
+          new int[]    {    2,     2,    6,      6,    12 },
+          new int[]    {    5,     5,   11,     11,    14 },
+          new int[]    {    2,     0,    1,      0,     1 });
+    }
+
+    { // keepShortTerm && keepLongTerm
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true, true);
+      assertTokenStreamContents(filter,
+          new String[] { "a", "bc", "bcd",  "ef", "efg", "efghi", "jk" },
+          new int[]    {  0,     2,     2,    6,      6,      6,    12 },
+          new int[]    {  1,     5,     5,   11,     11,     11,    14 },
+          new int[]    {  1,     1,     0,    1,      0,      0,     1 });
+    }
+    
+    { // keepShortTerm && !keepLongTerm
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true, false);
+      assertTokenStreamContents(filter,
+          new String[] { "a", "bc", "bcd",  "ef", "efg", "jk" },
+          new int[]    {  0,     2,     2,    6,      6,   12 },
+          new int[]    {  1,     5,     5,   11,     11,   14 },
+          new int[]    {  1,     1,     0,    1,      0,    1 });
+    }
+    
+    { // !keepShortTerm && keepLongTerm
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false, true);
+      assertTokenStreamContents(filter,
+          new String[] { "bc", "bcd",  "ef", "efg", "efghi", "jk" },
+          new int[]    {    2,     2,    6,      6,      6,    12 },
+          new int[]    {    5,     5,   11,     11,     11,    14 },
+          new int[]    {    2,     0,    1,      0,      0,     1 });
+    }
+  }
+
   public void testFrontRangeOfNgrams() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
     assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
@@ -85,14 +134,9 @@ public void testFilterPositions() throws Exception {
     TokenStream ts = whitespaceMockTokenizer("abcde vwxyz");
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3);
     assertTokenStreamContents(tokenizer,
-                              new String[]{"a","ab","abc","v","vw","vwx"},
-                              new int[]{0,0,0,6,6,6},
-                              new int[]{5,5,5,11,11,11},
-                              null,
-                              new int[]{1,0,0,1,0,0},
-                              null,
-                              null,
-                              false);
+        new String[] {"a","ab","abc","v","vw","vwx"},
+        new int[]      {0,   0,    0,  6,   6,    6},
+        new int[]      {5,   5,    5, 11,  11,   11});
   }
 
   private static class PositionFilter extends TokenFilter {
@@ -160,13 +204,15 @@ public void testRandomStrings() throws Exception {
     for (int i = 0; i < 10; i++) {
       final int min = TestUtil.nextInt(random(), 2, 10);
       final int max = TestUtil.nextInt(random(), min, 20);
+      final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+      final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
     
       Analyzer a = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
           return new TokenStreamComponents(tokenizer, 
-            new EdgeNGramTokenFilter(tokenizer, min, max));
+            new EdgeNGramTokenFilter(tokenizer, min, max, keepShortTerm, keepLongTerm));
         }    
       };
       checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
@@ -204,23 +250,45 @@ public void testGraphs() throws IOException {
   }
 
   public void testSupplementaryCharacters() throws IOException {
-    final String s = TestUtil.randomUnicodeString(random(), 10);
-    final int codePointCount = s.codePointCount(0, s.length());
-    final int minGram = TestUtil.nextInt(random(), 1, 3);
-    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
-    TokenStream tk = new KeywordTokenizer();
-    ((Tokenizer)tk).setReader(new StringReader(s));
-    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
-    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
-    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
-    tk.reset();
-    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
-      assertTrue(tk.incrementToken());
-      assertEquals(0, offsetAtt.startOffset());
-      assertEquals(s.length(), offsetAtt.endOffset());
-      final int end = Character.offsetByCodePoints(s, 0, i);
-      assertEquals(s.substring(0, end), termAtt.toString());
+    for (int i = 0; i < 20; i++) {
+      final String s = TestUtil.randomUnicodeString(random(), 10);
+      final int codePointCount = s.codePointCount(0, s.length());
+      final int minGram = TestUtil.nextInt(random(), 1, 3);
+      final int maxGram = TestUtil.nextInt(random(), minGram, 10);
+      final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+      final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+
+      TokenStream tk = new KeywordTokenizer();
+      ((Tokenizer)tk).setReader(new StringReader(s));
+      tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, keepShortTerm, keepLongTerm);
+      final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
+      final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
+      tk.reset();
+
+      if (codePointCount < minGram && keepShortTerm) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        assertEquals(s, termAtt.toString());
+      }
+
+      for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        final int end = Character.offsetByCodePoints(s, 0, j);
+        assertEquals(s.substring(0, end), termAtt.toString());
+      }
+
+      if (codePointCount > maxGram && keepLongTerm) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        assertEquals(s, termAtt.toString());
+      }
+
+      assertFalse(tk.incrementToken());
+      tk.close();
     }
-    assertFalse(tk.incrementToken());
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
index d8591a9726ec..3c1bed1b8015 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
@@ -97,10 +97,22 @@ public void testOversizedNgrams() throws Exception {
     assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
   }
   
+  public void testOversizedNgramsKeepShortTerm() throws Exception {
+    NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true, false);
+    assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
+  }
+  
   public void testSmallTokenInStream() throws Exception {
     input = whitespaceMockTokenizer("abc de fgh");
-    NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-    assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
+    NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3);
+    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
+  }
+  
+  public void testSmallTokenInStreamKeepShortTerm() throws Exception {
+    input = whitespaceMockTokenizer("abc de fgh");
+    NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true, false);
+    assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1});
+
   }
   
   public void testReset() throws Exception {
@@ -112,6 +124,50 @@ public void testReset() throws Exception {
     assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
   }
   
+  public void testKeepShortTermKeepLongTerm() throws Exception {
+    final String inputString = "a bcd efghi jk";
+
+    { // default behaviour
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3);
+      assertTokenStreamContents(filter,
+          new String[] { "bc", "bcd",  "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" },
+          new int[]    {    2,     2,     2,    6,     6,    6,     6,    6,     6,    6,   12 },
+          new int[]    {    5,     5,     5,   11,    11,   11,    11,   11,    11,   11,   14 },
+          new int[]    {    2,     0,     0,    1,     0,    0,     0,    0,     0,    0,    1 });
+    }
+
+    { // keepShortTerm && keepLongTerm
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true, true);
+      assertTokenStreamContents(filter,
+          new String[] { "a", "bc", "bcd",  "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" },
+          new int[]    {   0,    2,     2,     2,    6,     6,    6,     6,    6,     6,    6,       6,   12 },
+          new int[]    {   1,    5,     5,     5,   11,    11,   11,    11,   11,    11,   11,      11,   14 },
+          new int[]    {   1,    1,     0,     0,    1,     0,    0,     0,    0,     0,    0,       0,    1 });
+    }
+    
+    { // keepShortTerm && !keepLongTerm
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true, false);
+      assertTokenStreamContents(filter,
+          new String[] { "a", "bc", "bcd",  "cd", "ef",  "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" },
+          new int[]    {   0,    2,     2,     2,    6,      6,    6,     6,    6,     6,    6,   12 },
+          new int[]    {   1,    5,     5,     5,   11,     11,   11,    11,   11,    11,   11,   14 },
+          new int[]    {   1,    1,     0,     0,    1,      0,    0,     0,    0,     0,    0,    1 });
+    }
+    
+    { // !keepShortTerm && keepLongTerm
+      TokenStream ts = whitespaceMockTokenizer(inputString);
+      NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false, true);
+      assertTokenStreamContents(filter,
+          new String[] { "bc", "bcd",  "cd", "ef",  "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" },
+          new int[]    {    2,     2,     2,    6,      6,    6,     6,    6,     6,   6,        6,   12 },
+          new int[]    {    5,     5,     5,   11,     11,   11,    11,   11,    11,  11,       11,   14 },
+          new int[]    {    2,     0,     0,    1,      0,    0,     0,    0,     0,   0,        0,    1 });
+    }
+  }
+  
   // LUCENE-3642
   // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
   // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
@@ -139,12 +195,15 @@ public void testRandomStrings() throws Exception {
     for (int i = 0; i < 10; i++) {
       final int min = TestUtil.nextInt(random(), 2, 10);
       final int max = TestUtil.nextInt(random(), min, 20);
+      final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+      final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+      
       Analyzer a = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
           return new TokenStreamComponents(tokenizer, 
-              new NGramTokenFilter(tokenizer, min, max));
+              new NGramTokenFilter(tokenizer, min, max, keepShortTerm, keepLongTerm));
         }    
       };
       checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
@@ -167,27 +226,48 @@ protected TokenStreamComponents createComponents(String fieldName) {
   }
 
   public void testSupplementaryCharacters() throws IOException {
-    final String s = TestUtil.randomUnicodeString(random(), 10);
-    final int codePointCount = s.codePointCount(0, s.length());
-    final int minGram = TestUtil.nextInt(random(), 1, 3);
-    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
-    TokenStream tk = new KeywordTokenizer();
-    ((Tokenizer)tk).setReader(new StringReader(s));
-    tk = new NGramTokenFilter(tk, minGram, maxGram);
-    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
-    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
-    tk.reset();
-    for (int start = 0; start < codePointCount; ++start) {
-      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
+    for (int i = 0; i < 20; i++) {
+      final String s = TestUtil.randomUnicodeString(random(), 10);
+      final int codePointCount = s.codePointCount(0, s.length());
+      final int minGram = TestUtil.nextInt(random(), 1, 3);
+      final int maxGram = TestUtil.nextInt(random(), minGram, 10);
+      final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+      final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
+
+      TokenStream tk = new KeywordTokenizer();
+      ((Tokenizer)tk).setReader(new StringReader(s));
+      tk = new NGramTokenFilter(tk, minGram, maxGram, keepShortTerm, keepLongTerm);
+      final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
+      final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
+      tk.reset();
+
+      if (codePointCount < minGram && keepShortTerm) {
         assertTrue(tk.incrementToken());
         assertEquals(0, offsetAtt.startOffset());
         assertEquals(s.length(), offsetAtt.endOffset());
-        final int startIndex = Character.offsetByCodePoints(s, 0, start);
-        final int endIndex = Character.offsetByCodePoints(s, 0, end);
-        assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
+        assertEquals(s, termAtt.toString());
+      }
+      
+      for (int start = 0; start < codePointCount; ++start) {
+        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
+          assertTrue(tk.incrementToken());
+          assertEquals(0, offsetAtt.startOffset());
+          assertEquals(s.length(), offsetAtt.endOffset());
+          final int startIndex = Character.offsetByCodePoints(s, 0, start);
+          final int endIndex = Character.offsetByCodePoints(s, 0, end);
+          assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
+        }
       }
+      
+      if (codePointCount > maxGram && keepLongTerm) {
+        assertTrue(tk.incrementToken());
+        assertEquals(0, offsetAtt.startOffset());
+        assertEquals(s.length(), offsetAtt.endOffset());
+        assertEquals(s, termAtt.toString());
+      }
+      
+      assertFalse(tk.incrementToken());
+      tk.close();
     }
-    assertFalse(tk.incrementToken());
   }
-
 }