From ca80f34ec0c60a8a01a687f670b25283982e75be Mon Sep 17 00:00:00 2001 From: Ingomar Wesp Date: Fri, 30 Mar 2018 23:40:17 +0200 Subject: [PATCH] N-Gram filters: Add options to keep original terms. Adds the following properties to EdgeNGramTokenFilter & NGramTokenFilter (and their factories): - keepShortTerm: Don't drop input terms shorter than minGramSize. - keepLongTerm: Don't drop input terms longer than maxGramSize. --- .../ngram/EdgeNGramFilterFactory.java | 6 +- .../analysis/ngram/EdgeNGramTokenFilter.java | 77 +++++++---- .../analysis/ngram/NGramFilterFactory.java | 6 +- .../analysis/ngram/NGramTokenFilter.java | 87 ++++++++----- .../ngram/EdgeNGramTokenFilterTest.java | 120 ++++++++++++++---- .../analysis/ngram/NGramTokenFilterTest.java | 120 +++++++++++++++--- 6 files changed, 316 insertions(+), 100 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java index 020b85bb5e92..6bc830a3ca4e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java @@ -36,12 +36,16 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean keepShortTerm; + private final boolean keepLongTerm; /** Creates a new EdgeNGramFilterFactory */ public EdgeNGramFilterFactory(Map args) { super(args); minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + keepShortTerm = getBoolean(args, "keepShortTerm", EdgeNGramTokenFilter.DEFAULT_KEEP_SHORT_TERM); + keepLongTerm = getBoolean(args, "keepLongTerm", EdgeNGramTokenFilter.DEFAULT_KEEP_LONG_TERM); if (!args.isEmpty()) { throw new 
IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +53,6 @@ public EdgeNGramFilterFactory(Map args) { @Override public TokenFilter create(TokenStream input) { - return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize); + return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, keepShortTerm, keepLongTerm); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 56efd897d178..08013da99f88 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -34,27 +34,35 @@ public final class EdgeNGramTokenFilter extends TokenFilter { public static final int DEFAULT_MAX_GRAM_SIZE = 1; public static final int DEFAULT_MIN_GRAM_SIZE = 1; + public static final boolean DEFAULT_KEEP_SHORT_TERM = false; + public static final boolean DEFAULT_KEEP_LONG_TERM = false; private final int minGram; private final int maxGram; + private final boolean keepShortTerm; + private final boolean keepLongTerm; + private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; - private int savePosIncr; + private int curPosIncr; private State state; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final CharTermAttribute termAtt; + private final PositionIncrementAttribute posIncrAtt; /** - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range + * Creates EdgeNGramTokenFilter that generates edge n-grams of sizes in the given range. 
* * @param input {@link TokenStream} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate + * @param keepShortTerm whether to pass through tokens that are shorter than minGram + * @param keepLongTerm whether to pass through tokens that are longer than maxGram */ - public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { + public EdgeNGramTokenFilter( + TokenStream input, int minGram, int maxGram, boolean keepShortTerm, boolean keepLongTerm) { super(input); if (minGram < 1) { @@ -67,6 +75,15 @@ public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { this.minGram = minGram; this.maxGram = maxGram; + this.keepShortTerm = keepShortTerm; + this.keepLongTerm = keepLongTerm; + + this.termAtt = addAttribute(CharTermAttribute.class); + this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); + } + + public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) { + this(input, minGram, maxGram, DEFAULT_KEEP_SHORT_TERM, DEFAULT_KEEP_LONG_TERM); } @Override @@ -75,32 +92,46 @@ public final boolean incrementToken() throws IOException { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - state = captureState(); - savePosIncr += posIncrAtt.getPositionIncrement(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength); + curPosIncr += posIncrAtt.getPositionIncrement(); + + if (keepShortTerm && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. 
+ posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit - if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams - // grab gramSize chars from front or back + + if (curGramSize <= curTermCodePointCount) { + if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram restoreState(state); // first ngram gets increment, others don't - if (curGramSize == minGram) { - posIncrAtt.setPositionIncrement(savePosIncr); - savePosIncr = 0; - } else { - posIncrAtt.setPositionIncrement(0); - } + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize); termAtt.copyBuffer(curTermBuffer, 0, charLength); curGramSize++; return true; } + else if (keepLongTerm) { + // Token is longer than maxGram, but we'd still like to keep it. + restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } } + // Done with this input token, get next token on the next iteration. 
curTermBuffer = null; } } @@ -109,6 +140,6 @@ public final boolean incrementToken() throws IOException { public void reset() throws IOException { super.reset(); curTermBuffer = null; - savePosIncr = 0; + curPosIncr = 0; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java index 2064716b78b7..60165be8b6ad 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java @@ -36,12 +36,16 @@ public class NGramFilterFactory extends TokenFilterFactory { private final int maxGramSize; private final int minGramSize; + private final boolean keepShortTerm; + private final boolean keepLongTerm; /** Creates a new NGramFilterFactory */ public NGramFilterFactory(Map args) { super(args); minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); + keepShortTerm = getBoolean(args, "keepShortTerm", NGramTokenFilter.DEFAULT_KEEP_SHORT_TERM); + keepLongTerm = getBoolean(args, "keepLongTerm", NGramTokenFilter.DEFAULT_KEEP_LONG_TERM); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -49,6 +53,6 @@ public NGramFilterFactory(Map args) { @Override public TokenFilter create(TokenStream input) { - return new NGramTokenFilter(input, minGramSize, maxGramSize); + return new NGramTokenFilter(input, minGramSize, maxGramSize, keepShortTerm, keepLongTerm); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index a2e0aa7e5884..fb21e30d16f2 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -21,7 +21,6 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -42,28 +41,27 @@ public final class NGramTokenFilter extends TokenFilter { public static final int DEFAULT_MIN_NGRAM_SIZE = 1; public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + public static final boolean DEFAULT_KEEP_SHORT_TERM = false; + public static final boolean DEFAULT_KEEP_LONG_TERM = false; - private final int minGram, maxGram; + private final int minGram; + private final int maxGram; + private final boolean keepShortTerm; + private final boolean keepLongTerm; private char[] curTermBuffer; private int curTermLength; - private int curCodePointCount; + private int curTermCodePointCount; private int curGramSize; private int curPos; - private int curPosInc; + private int curPosIncr; private State state; - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncAtt; + private final CharTermAttribute termAtt; + private final PositionIncrementAttribute posIncrAtt; - /** - * Creates NGramTokenFilter with given min and max n-grams. 
- * @param input {@link TokenStream} holding the input to be tokenized - * @param minGram the smallest n-gram to generate - * @param maxGram the largest n-gram to generate - */ - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { - super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE)); + public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean keepShortTerm, boolean keepLongTerm) { + super(input); if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -72,8 +70,21 @@ public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { } this.minGram = minGram; this.maxGram = maxGram; + this.keepShortTerm = keepShortTerm; + this.keepLongTerm = keepLongTerm; - posIncAtt = addAttribute(PositionIncrementAttribute.class); + this.termAtt = addAttribute(CharTermAttribute.class); + this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); + } + + /** + * Creates NGramTokenFilter with given min and max n-grams. + * @param input {@link TokenStream} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { + this(input, minGram, maxGram, DEFAULT_KEEP_SHORT_TERM, DEFAULT_KEEP_LONG_TERM); } /** @@ -84,39 +95,56 @@ public NGramTokenFilter(TokenStream input) { this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); } - /** Returns the next token in the stream, or null at EOS. 
*/ @Override public final boolean incrementToken() throws IOException { while (true) { if (curTermBuffer == null) { if (!input.incrementToken()) { return false; - } else { - curTermBuffer = termAtt.buffer().clone(); - curTermLength = termAtt.length(); - curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); - curGramSize = minGram; - curPos = 0; - curPosInc = posIncAtt.getPositionIncrement(); - state = captureState(); } + state = captureState(); + + curTermLength = termAtt.length(); + curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length()); + curPosIncr += posIncrAtt.getPositionIncrement(); + curPos = 0; + + if (keepShortTerm && curTermCodePointCount < minGram) { + // Token is shorter than minGram, but we'd still like to keep it. + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; + return true; + } + + curTermBuffer = termAtt.buffer().clone(); + curGramSize = minGram; } - if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) { + if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) { ++curPos; curGramSize = minGram; } - if ((curPos + curGramSize) <= curCodePointCount) { + if ((curPos + curGramSize) <= curTermCodePointCount) { restoreState(state); final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); termAtt.copyBuffer(curTermBuffer, start, end - start); - posIncAtt.setPositionIncrement(curPosInc); - curPosInc = 0; + posIncrAtt.setPositionIncrement(curPosIncr); + curPosIncr = 0; curGramSize++; return true; } - curTermBuffer = null; + else if (keepLongTerm && curTermCodePointCount > maxGram) { + // Token is longer than maxGram, but we'd still like to keep it. 
+ restoreState(state); + posIncrAtt.setPositionIncrement(0); + termAtt.copyBuffer(curTermBuffer, 0, curTermLength); + curTermBuffer = null; + return true; + } + + // Done with this input token, get next token on next iteration. + curTermBuffer = null; } } @@ -124,5 +152,6 @@ public final boolean incrementToken() throws IOException { public void reset() throws IOException { super.reset(); curTermBuffer = null; + curPosIncr = 0; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java index d7536e7050f3..b4bf6b33ea82 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java @@ -76,6 +76,55 @@ public void testOversizedNgrams() throws Exception { assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]); } + public void testOversizedNgramsKeepShortTerm() throws Exception { + EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true, false); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + + public void testKeepShortTermKeepLongTerm() throws Exception { + final String inputString = "a bcd efghi jk"; + + { // default behaviour + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "ef", "efg", "jk" }, + new int[] { 2, 2, 6, 6, 12 }, + new int[] { 5, 5, 11, 11, 14 }, + new int[] { 2, 0, 1, 0, 1 }); + } + + { // keepShortTerm && keepLongTerm + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "ef", 
"efg", "efghi", "jk" }, + new int[] { 0, 2, 2, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 11, 11, 11, 14 }, + new int[] { 1, 1, 0, 1, 0, 0, 1 }); + } + + { // keepShortTerm && !keepLongTerm + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true, false); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "ef", "efg", "jk" }, + new int[] { 0, 2, 2, 6, 6, 12 }, + new int[] { 1, 5, 5, 11, 11, 14 }, + new int[] { 1, 1, 0, 1, 0, 1 }); + } + + { // !keepShortTerm && keepLongTerm + TokenStream ts = whitespaceMockTokenizer(inputString); + EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false, true); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "ef", "efg", "efghi", "jk" }, + new int[] { 2, 2, 6, 6, 6, 12 }, + new int[] { 5, 5, 11, 11, 11, 14 }, + new int[] { 2, 0, 1, 0, 0, 1 }); + } + } + public void testFrontRangeOfNgrams() throws Exception { EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3); assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5}); @@ -85,14 +134,9 @@ public void testFilterPositions() throws Exception { TokenStream ts = whitespaceMockTokenizer("abcde vwxyz"); EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3); assertTokenStreamContents(tokenizer, - new String[]{"a","ab","abc","v","vw","vwx"}, - new int[]{0,0,0,6,6,6}, - new int[]{5,5,5,11,11,11}, - null, - new int[]{1,0,0,1,0,0}, - null, - null, - false); + new String[] {"a","ab","abc","v","vw","vwx"}, + new int[] {0, 0, 0, 6, 6, 6}, + new int[] {5, 5, 5, 11, 11, 11}); } private static class PositionFilter extends TokenFilter { @@ -160,13 +204,15 @@ public void testRandomStrings() throws Exception { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 
2 == 0; + final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0; Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new EdgeNGramTokenFilter(tokenizer, min, max)); + new EdgeNGramTokenFilter(tokenizer, min, max, keepShortTerm, keepLongTerm)); } }; checkRandomData(random(), a, 100*RANDOM_MULTIPLIER); @@ -204,23 +250,45 @@ public void testGraphs() throws IOException { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new EdgeNGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) { - assertTrue(tk.incrementToken()); - assertEquals(0, offsetAtt.startOffset()); - assertEquals(s.length(), offsetAtt.endOffset()); - final int end = Character.offsetByCodePoints(s, 0, i); - assertEquals(s.substring(0, end), termAtt.toString()); + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + 
((Tokenizer)tk).setReader(new StringReader(s)); + tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, keepShortTerm, keepLongTerm); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && keepShortTerm) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int end = Character.offsetByCodePoints(s, 0, j); + assertEquals(s.substring(0, end), termAtt.toString()); + } + + if (codePointCount > maxGram && keepLongTerm) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java index d8591a9726ec..3c1bed1b8015 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java @@ -97,10 +97,22 @@ public void testOversizedNgrams() throws Exception { assertTokenStreamContents(filter, new String[0], new int[0], new int[0]); } + public void testOversizedNgramsKeepShortTerm() throws Exception { + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true, false); + assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5}); + } + public 
void testSmallTokenInStream() throws Exception { input = whitespaceMockTokenizer("abc de fgh"); - NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3); - assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3); + assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); + } + + public void testSmallTokenInStreamKeepShortTerm() throws Exception { + input = whitespaceMockTokenizer("abc de fgh"); + NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true, false); + assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1}); + } public void testReset() throws Exception { @@ -112,6 +124,50 @@ public void testReset() throws Exception { assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } + public void testKeepShortTermKeepLongTerm() throws Exception { + final String inputString = "a bcd efghi jk"; + + { // default behaviour + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" }, + new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 }); + } + + { // keepShortTerm && keepLongTerm + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true, true); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" }, + new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 
11, 11, 14 }, + new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }); + } + + { // keepShortTerm && !keepLongTerm + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true, false); + assertTokenStreamContents(filter, + new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" }, + new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 }); + } + + { // !keepShortTerm && keepLongTerm + TokenStream ts = whitespaceMockTokenizer(inputString); + NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false, true); + assertTokenStreamContents(filter, + new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" }, + new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 }, + new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 }, + new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }); + } + } + // LUCENE-3642 // EdgeNgram blindly adds term length to offset, but this can take things out of bounds // wrt original text if a previous filter increases the length of the word (in this case æ -> ae) @@ -139,12 +195,15 @@ public void testRandomStrings() throws Exception { for (int i = 0; i < 10; i++) { final int min = TestUtil.nextInt(random(), 2, 10); final int max = TestUtil.nextInt(random(), min, 20); + final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, min, max)); + new NGramTokenFilter(tokenizer, min, max, keepShortTerm, keepLongTerm)); } }; checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20); @@ 
-167,27 +226,48 @@ protected TokenStreamComponents createComponents(String fieldName) { } public void testSupplementaryCharacters() throws IOException { - final String s = TestUtil.randomUnicodeString(random(), 10); - final int codePointCount = s.codePointCount(0, s.length()); - final int minGram = TestUtil.nextInt(random(), 1, 3); - final int maxGram = TestUtil.nextInt(random(), minGram, 10); - TokenStream tk = new KeywordTokenizer(); - ((Tokenizer)tk).setReader(new StringReader(s)); - tk = new NGramTokenFilter(tk, minGram, maxGram); - final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); - final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); - tk.reset(); - for (int start = 0; start < codePointCount; ++start) { - for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + for (int i = 0; i < 20; i++) { + final String s = TestUtil.randomUnicodeString(random(), 10); + final int codePointCount = s.codePointCount(0, s.length()); + final int minGram = TestUtil.nextInt(random(), 1, 3); + final int maxGram = TestUtil.nextInt(random(), minGram, 10); + final boolean keepShortTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + final boolean keepLongTerm = TestUtil.nextInt(random(), 0, 1) % 2 == 0; + + TokenStream tk = new KeywordTokenizer(); + ((Tokenizer)tk).setReader(new StringReader(s)); + tk = new NGramTokenFilter(tk, minGram, maxGram, keepShortTerm, keepLongTerm); + final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + + if (codePointCount < minGram && keepShortTerm) { assertTrue(tk.incrementToken()); assertEquals(0, offsetAtt.startOffset()); assertEquals(s.length(), offsetAtt.endOffset()); - final int startIndex = Character.offsetByCodePoints(s, 0, start); - final int endIndex = Character.offsetByCodePoints(s, 0, end); - assertEquals(s.substring(startIndex, endIndex), 
termAtt.toString()); + assertEquals(s, termAtt.toString()); + } + + for (int start = 0; start < codePointCount; ++start) { + for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + final int startIndex = Character.offsetByCodePoints(s, 0, start); + final int endIndex = Character.offsetByCodePoints(s, 0, end); + assertEquals(s.substring(startIndex, endIndex), termAtt.toString()); + } } + + if (codePointCount > maxGram && keepLongTerm) { + assertTrue(tk.incrementToken()); + assertEquals(0, offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + assertEquals(s, termAtt.toString()); + } + + assertFalse(tk.incrementToken()); + tk.close(); } - assertFalse(tk.incrementToken()); } - }