From 4461cbd92769700126bab45c0ce583459d2fb8b7 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Thu, 20 Apr 2017 17:35:02 -0400 Subject: [PATCH 1/2] LUCENE-7795 - illegal offsets in WordDelimiterFilter should prevent advancing start offset --- .../analysis/miscellaneous/WordDelimiterFilter.java | 11 +++-------- .../miscellaneous/TestWordDelimiterFilter.java | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index aef697ce4ffe..637de770d8bb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -88,7 +88,7 @@ */ @Deprecated public final class WordDelimiterFilter extends TokenFilter { - + public static final int LOWER = 0x01; public static final int UPPER = 0x02; public static final int DIGIT = 0x04; @@ -504,13 +504,8 @@ private void generatePart(boolean isSingleWord) { int endOffset = savedStartOffset + iterator.end; if (hasIllegalOffsets) { - // historically this filter did this regardless for 'isSingleWord', - // but we must do a sanity check: - if (isSingleWord && startOffset <= savedEndOffset) { - offsetAttribute.setOffset(startOffset, savedEndOffset); - } else { - offsetAttribute.setOffset(savedStartOffset, savedEndOffset); - } + //Since it has illegal offsets, we don't know where we start or end in relation to term, so use old offsets. + offsetAttribute.setOffset(savedStartOffset, savedEndOffset); } else { offsetAttribute.setOffset(startOffset, endOffset); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java index 2804bfd5a07e..e4dfd87d1c2e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java @@ -95,7 +95,7 @@ public void testOffsetChange2() throws Exception { assertTokenStreamContents(wdf, new String[] { "übelkeit" }, - new int[] { 8 }, + new int[] { 7 }, new int[] { 17 }); } From 9121c7e128af5785ca2b0c3eda6dba5dcb8131d9 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Thu, 20 Apr 2017 17:41:57 -0400 Subject: [PATCH 2/2] Clean up now-unused variable --- .../lucene/analysis/miscellaneous/WordDelimiterFilter.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index 637de770d8bb..fa2f3b9a13c4 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -320,7 +320,7 @@ public boolean incrementToken() throws IOException { // word surrounded by delimiters: always output if (iterator.isSingleWord()) { - generatePart(true); + generatePart(); iterator.next(); first = false; return true; @@ -353,7 +353,7 @@ public boolean incrementToken() throws IOException { // if we should output the word or number part if (shouldGenerateParts(wordType)) { - generatePart(false); + generatePart(); buffer(); } @@ -495,9 +495,8 @@ private void concatenate(WordDelimiterConcatenation concatenation) { /** * Generates a word/number part, updating the appropriate attributes * - * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise */ - private void generatePart(boolean isSingleWord) { + private void generatePart() { clearAttributes(); termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current); int startOffset = savedStartOffset + iterator.current;