From 5d6086e1994d766a3dd39a47b14a8cd80a7280e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Vl=C4=8Dek?= Date: Mon, 18 Dec 2023 14:48:22 +0100 Subject: [PATCH] Fix position increment in (Reverse)PathHierarchyTokenizer (#12875) * Fix PathHierarchyTokenizer positions PathHierarchyTokenizer was emitting multiple tokens in the same position with changing offsets. To be consistent with EdgeNGramTokenizer (which is conceptually similar -- it's emitting multiple prefixes/suffixes off the input string), we can output every token with length 1 with positions incrementing by 1. * Fix ReversePathHierarchyTokenizer positions Making ReversePathHierarchyTokenizer consistent with recent changes in PathHierarchyTokenizer. --------- Co-authored-by: Michael Froh --- lucene/CHANGES.txt | 3 ++ lucene/MIGRATE.md | 5 +++ .../analysis/path/PathHierarchyTokenizer.java | 9 ++-- .../path/ReversePathHierarchyTokenizer.java | 7 ++-- .../path/TestPathHierarchyTokenizer.java | 41 +++++++++++++------ .../TestReversePathHierarchyTokenizer.java | 35 ++++++++++++---- 6 files changed, 69 insertions(+), 31 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 223c10c0f999..da20a8559bab 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -77,6 +77,9 @@ API Changes * GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller) +* GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer + and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček) + New Features --------------------- diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 348a9d2e84ea..75f7c5b4eeba 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -134,6 +134,11 @@ It now declares that it may throw `IOException`. This was an oversight because compiled expressions call `DoubleValues#doubleValue` behind the scenes, which may throw `IOException` on index problems, bubbling up unexpectedly to the caller. +### PathHierarchyTokenizer and ReversePathHierarchyTokenizer do not produce overlapping tokens + +`(Reverse)PathHierarchyTokenizer` now produces sequential (instead of overlapping) tokens with accurate +offsets, making positional queries and highlighters possible for fields tokenized with this tokenizer. + ## Migration from Lucene 9.0 to Lucene 9.1 ### Test framework package migration and module (LUCENE-10301) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index e7dfa320b6b3..dfd727570342 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -100,7 +100,8 @@ public PathHierarchyTokenizer( private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionIncrementAttribute posIncAtt = + addAttribute(PositionIncrementAttribute.class); private int startPosition = 0; private int skipped = 0; private boolean endDelimiter = false; @@ -112,11 +113,7 @@ public PathHierarchyTokenizer( public final boolean incrementToken() throws IOException { clearAttributes(); termAtt.append(resultToken); - if (resultToken.length() == 0) { - posAtt.setPositionIncrement(1); - } else { - posAtt.setPositionIncrement(0); - } + posIncAtt.setPositionIncrement(1); int length = 0; boolean added = false; if (endDelimiter) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java index 7b1f60f51c4b..b3cb2bbd3ae5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java @@ -112,7 +112,8 @@ public ReversePathHierarchyTokenizer( private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionIncrementAttribute posIncAtt = + addAttribute(PositionIncrementAttribute.class); private int endPosition = 0; private int finalOffset = 0; @@ -158,10 +159,8 @@ public final boolean incrementToken() throws IOException { endPosition = delimiterPositions.get(idx); } finalOffset = correctOffset(length); - posAtt.setPositionIncrement(1); - } else { - posAtt.setPositionIncrement(0); } + posIncAtt.setPositionIncrement(1); while (skipped < delimitersCount - skip - 1) { int start = delimiterPositions.get(skipped); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java index 238319be73e7..1662171c9bac 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java @@ -19,6 +19,7 @@ import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_DELIMITER; import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_SKIP; +import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Random; @@ -41,7 +42,7 @@ public void testBasic() throws Exception { new String[] {"/a", "/a/b", "/a/b/c"}, new int[] {0, 0, 0}, new int[] {2, 4, 6}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -56,7 +57,7 @@ public void testEndOfDelimiter() throws Exception { new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"}, new int[] {0, 0, 0, 0}, new int[] {2, 4, 6, 7}, - new int[] {1, 0, 0, 0}, + new int[] {1, 1, 1, 1}, path.length()); } @@ -71,7 +72,7 @@ public void testStartOfChar() throws Exception { new String[] {"a", "a/b", "a/b/c"}, new int[] {0, 0, 0}, new int[] {1, 3, 5}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -86,7 +87,7 @@ public void testStartOfCharEndOfDelimiter() throws Exception { new String[] {"a", "a/b", "a/b/c", "a/b/c/"}, new int[] {0, 0, 0, 0}, new int[] {1, 3, 5, 6}, - new int[] {1, 0, 0, 0}, + new int[] {1, 1, 1, 1}, path.length()); } @@ -111,7 +112,7 @@ public void testOnlyDelimiters() throws Exception { new String[] {"/", "//"}, new int[] {0, 0}, new int[] {1, 2}, - new int[] {1, 0}, + new int[] {1, 1}, path.length()); } @@ -125,7 +126,7 @@ public void testReplace() throws Exception { new String[] {"\\a", "\\a\\b", "\\a\\b\\c"}, new int[] {0, 0, 0}, new int[] {2, 4, 6}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -139,7 +140,7 @@ public void testWindowsPath() throws Exception { new String[] {"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"}, new int[] {0, 0, 0, 0}, new int[] {2, 4, 6, 8}, - new int[] {1, 0, 0, 0}, + new int[] {1, 1, 1, 1}, path.length()); } @@ -158,7 +159,7 @@ public void testNormalizeWinDelimToLinuxDelim() throws Exception { new String[] {"c:", "c:/a", "c:/a/b", "c:/a/b/c"}, new int[] {0, 0, 0, 0}, new int[] {2, 4, 6, 8}, - new int[] {1, 0, 0, 0}, + new int[] {1, 1, 1, 1}, path.length()); } @@ -172,7 +173,7 @@ public void testBasicSkip() throws Exception { new String[] {"/b", "/b/c"}, new int[] {2, 2}, new int[] {4, 6}, - new int[] {1, 0}, + new int[] {1, 1}, path.length()); } @@ -186,7 +187,7 @@ public void testEndOfDelimiterSkip() throws Exception { new String[] {"/b", "/b/c", "/b/c/"}, new int[] {2, 2, 2}, new int[] {4, 6, 7}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -200,7 +201,7 @@ public void testStartOfCharSkip() throws Exception { new String[] {"/b", "/b/c"}, new int[] {1, 1}, new int[] {3, 5}, - new int[] {1, 0}, + new int[] {1, 1}, path.length()); } @@ -214,7 +215,7 @@ public void testStartOfCharEndOfDelimiterSkip() throws Exception { new String[] {"/b", "/b/c", "/b/c/"}, new int[] {1, 1, 1}, new int[] {3, 5, 6}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -270,4 +271,20 @@ protected TokenStreamComponents createComponents(String fieldName) { checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false); a.close(); } + + private final Analyzer analyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new PathHierarchyTokenizer(); + return new TokenStreamComponents(tokenizer); + } + }; + + public void testTokenizerViaAnalyzerOutput() throws IOException { + assertAnalyzesTo(analyzer, "a/b/c", new String[] {"a", "a/b", "a/b/c"}); + assertAnalyzesTo(analyzer, "a/b/c/", new String[] {"a", "a/b", "a/b/c", "a/b/c/"}); + assertAnalyzesTo(analyzer, "/a/b/c", new String[] {"/a", "/a/b", "/a/b/c"}); + assertAnalyzesTo(analyzer, "/a/b/c/", new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"}); + } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java index 683faf72f9cf..c23f954cea0e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java @@ -19,6 +19,7 @@ import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_DELIMITER; import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_SKIP; +import java.io.IOException; import java.io.StringReader; import java.util.Random; import org.apache.lucene.analysis.Analyzer; @@ -38,7 +39,7 @@ public void testBasicReverse() throws Exception { new String[] {"/a/b/c", "a/b/c", "b/c", "c"}, new int[] {0, 1, 3, 5}, new int[] {6, 6, 6, 6}, - new int[] {1, 0, 0, 0}, + new int[] {1, 1, 1, 1}, path.length()); } @@ -53,7 +54,7 @@ public void testEndOfDelimiterReverse() throws Exception { new String[] {"/a/b/c/", "a/b/c/", "b/c/", "c/"}, new int[] {0, 1, 3, 5}, new int[] {7, 7, 7, 7}, - new int[] {1, 0, 0, 0}, + new int[] {1, 1, 1, 1}, path.length()); } @@ -68,7 +69,7 @@ public void testStartOfCharReverse() throws Exception { new String[] {"a/b/c", "b/c", "c"}, new int[] {0, 2, 4}, new int[] {5, 5, 5}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -83,7 +84,7 @@ public void testStartOfCharEndOfDelimiterReverse() throws Exception { new String[] {"a/b/c/", "b/c/", "c/"}, new int[] {0, 2, 4}, new int[] {6, 6, 6}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -108,7 +109,7 @@ public void testOnlyDelimitersReverse() throws Exception { new String[] {"//", "/"}, new int[] {0, 1}, new int[] {2, 2}, - new int[] {1, 0}, + new int[] {1, 1}, path.length()); } @@ -124,7 +125,7 @@ public void testEndOfDelimiterReverseSkip() throws Exception { new String[] {"/a/b/", "a/b/", "b/"}, new int[] {0, 1, 3}, new int[] {5, 5, 5}, - new int[] {1, 0, 0}, + new int[] {1, 1, 1}, path.length()); } @@ -139,7 +140,7 @@ public void testStartOfCharReverseSkip() throws Exception { new String[] {"a/b/", "b/"}, new int[] {0, 2}, new int[] {4, 4}, - new int[] {1, 0}, + new int[] {1, 1}, path.length()); } @@ -154,7 +155,7 @@ public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception { new String[] {"a/b/", "b/"}, new int[] {0, 2}, new int[] {4, 4}, - new int[] {1, 0}, + new int[] {1, 1}, path.length()); } @@ -189,7 +190,7 @@ public void testReverseSkip2() throws Exception { new String[] {"/a/", "a/"}, new int[] {0, 1}, new int[] {3, 3}, - new int[] {1, 0}, + new int[] {1, 1}, path.length()); } @@ -227,4 +228,20 @@ protected TokenStreamComponents createComponents(String fieldName) { checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false); a.close(); } + + private final Analyzer analyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new ReversePathHierarchyTokenizer(); + return new TokenStreamComponents(tokenizer); + } + }; + + public void testTokenizerViaAnalyzerOutput() throws IOException { + assertAnalyzesTo(analyzer, "a/b/c", new String[] {"a/b/c", "b/c", "c"}); + assertAnalyzesTo(analyzer, "a/b/c/", new String[] {"a/b/c/", "b/c/", "c/"}); + assertAnalyzesTo(analyzer, "/a/b/c", new String[] {"/a/b/c", "a/b/c", "b/c", "c"}); + assertAnalyzesTo(analyzer, "/a/b/c/", new String[] {"/a/b/c/", "a/b/c/", "b/c/", "c/"}); + } }