Skip to content

Commit

Permalink
Fix position increment in (Reverse)PathHierarchyTokenizer (#12875)
Browse files Browse the repository at this point in the history
* Fix PathHierarchyTokenizer positions

PathHierarchyTokenizer was emitting multiple tokens in the same position
with changing offsets. To be consistent with EdgeNGramTokenizer (which
is conceptually similar -- it's emitting multiple prefixes/suffixes off
the input string), we can output every token with a position increment
of 1, so positions advance by one for each emitted token.

* Fix ReversePathHierarchyTokenizer positions

Making ReversePathHierarchyTokenizer consistent with recent changes in PathHierarchyTokenizer.

---------

Co-authored-by: Michael Froh <froh@amazon.com>
  • Loading branch information
lukas-vlcek and msfroh committed Dec 18, 2023
1 parent 6bb244a commit 5d6086e
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 31 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Expand Up @@ -77,6 +77,9 @@ API Changes

* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller)

* GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)

New Features
---------------------

Expand Down
5 changes: 5 additions & 0 deletions lucene/MIGRATE.md
Expand Up @@ -134,6 +134,11 @@ It now declares that it may throw `IOException`. This was an oversight because
compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
may throw `IOException` on index problems, bubbling up unexpectedly to the caller.

### PathHierarchyTokenizer and ReversePathHierarchyTokenizer do not produce overlapping tokens

`(Reverse)PathHierarchyTokenizer` now produces sequential (instead of overlapping) tokens with accurate
offsets, making positional queries and highlighting possible for fields tokenized with this tokenizer.

## Migration from Lucene 9.0 to Lucene 9.1

### Test framework package migration and module (LUCENE-10301)
Expand Down
Expand Up @@ -100,7 +100,8 @@ public PathHierarchyTokenizer(

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
private int startPosition = 0;
private int skipped = 0;
private boolean endDelimiter = false;
Expand All @@ -112,11 +113,7 @@ public PathHierarchyTokenizer(
public final boolean incrementToken() throws IOException {
clearAttributes();
termAtt.append(resultToken);
if (resultToken.length() == 0) {
posAtt.setPositionIncrement(1);
} else {
posAtt.setPositionIncrement(0);
}
posIncAtt.setPositionIncrement(1);
int length = 0;
boolean added = false;
if (endDelimiter) {
Expand Down
Expand Up @@ -112,7 +112,8 @@ public ReversePathHierarchyTokenizer(

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);

private int endPosition = 0;
private int finalOffset = 0;
Expand Down Expand Up @@ -158,10 +159,8 @@ public final boolean incrementToken() throws IOException {
endPosition = delimiterPositions.get(idx);
}
finalOffset = correctOffset(length);
posAtt.setPositionIncrement(1);
} else {
posAtt.setPositionIncrement(0);
}
posIncAtt.setPositionIncrement(1);

while (skipped < delimitersCount - skip - 1) {
int start = delimiterPositions.get(skipped);
Expand Down
Expand Up @@ -19,6 +19,7 @@
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_SKIP;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Random;
Expand All @@ -41,7 +42,7 @@ public void testBasic() throws Exception {
new String[] {"/a", "/a/b", "/a/b/c"},
new int[] {0, 0, 0},
new int[] {2, 4, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand All @@ -56,7 +57,7 @@ public void testEndOfDelimiter() throws Exception {
new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"},
new int[] {0, 0, 0, 0},
new int[] {2, 4, 6, 7},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

Expand All @@ -71,7 +72,7 @@ public void testStartOfChar() throws Exception {
new String[] {"a", "a/b", "a/b/c"},
new int[] {0, 0, 0},
new int[] {1, 3, 5},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand All @@ -86,7 +87,7 @@ public void testStartOfCharEndOfDelimiter() throws Exception {
new String[] {"a", "a/b", "a/b/c", "a/b/c/"},
new int[] {0, 0, 0, 0},
new int[] {1, 3, 5, 6},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

Expand All @@ -111,7 +112,7 @@ public void testOnlyDelimiters() throws Exception {
new String[] {"/", "//"},
new int[] {0, 0},
new int[] {1, 2},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

Expand All @@ -125,7 +126,7 @@ public void testReplace() throws Exception {
new String[] {"\\a", "\\a\\b", "\\a\\b\\c"},
new int[] {0, 0, 0},
new int[] {2, 4, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand All @@ -139,7 +140,7 @@ public void testWindowsPath() throws Exception {
new String[] {"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},
new int[] {0, 0, 0, 0},
new int[] {2, 4, 6, 8},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

Expand All @@ -158,7 +159,7 @@ public void testNormalizeWinDelimToLinuxDelim() throws Exception {
new String[] {"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
new int[] {0, 0, 0, 0},
new int[] {2, 4, 6, 8},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

Expand All @@ -172,7 +173,7 @@ public void testBasicSkip() throws Exception {
new String[] {"/b", "/b/c"},
new int[] {2, 2},
new int[] {4, 6},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

Expand All @@ -186,7 +187,7 @@ public void testEndOfDelimiterSkip() throws Exception {
new String[] {"/b", "/b/c", "/b/c/"},
new int[] {2, 2, 2},
new int[] {4, 6, 7},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand All @@ -200,7 +201,7 @@ public void testStartOfCharSkip() throws Exception {
new String[] {"/b", "/b/c"},
new int[] {1, 1},
new int[] {3, 5},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

Expand All @@ -214,7 +215,7 @@ public void testStartOfCharEndOfDelimiterSkip() throws Exception {
new String[] {"/b", "/b/c", "/b/c/"},
new int[] {1, 1, 1},
new int[] {3, 5, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand Down Expand Up @@ -270,4 +271,20 @@ protected TokenStreamComponents createComponents(String fieldName) {
checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false);
a.close();
}

/** Analyzer wrapping a bare {@link PathHierarchyTokenizer} with default delimiter and skip. */
private final Analyzer analyzer =
    new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        PathHierarchyTokenizer source = new PathHierarchyTokenizer();
        return new TokenStreamComponents(source);
      }
    };

/**
 * Verifies the tokenizer output end-to-end through an {@link Analyzer}, covering paths with and
 * without leading/trailing delimiters.
 */
public void testTokenizerViaAnalyzerOutput() throws IOException {
  assertPathTokens("a/b/c", "a", "a/b", "a/b/c");
  assertPathTokens("a/b/c/", "a", "a/b", "a/b/c", "a/b/c/");
  assertPathTokens("/a/b/c", "/a", "/a/b", "/a/b/c");
  assertPathTokens("/a/b/c/", "/a", "/a/b", "/a/b/c", "/a/b/c/");
}

/** Asserts that analyzing {@code path} yields exactly the given prefix tokens, in order. */
private void assertPathTokens(String path, String... expectedTokens) throws IOException {
  assertAnalyzesTo(analyzer, path, expectedTokens);
}
}
Expand Up @@ -19,6 +19,7 @@
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_SKIP;

import java.io.IOException;
import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
Expand All @@ -38,7 +39,7 @@ public void testBasicReverse() throws Exception {
new String[] {"/a/b/c", "a/b/c", "b/c", "c"},
new int[] {0, 1, 3, 5},
new int[] {6, 6, 6, 6},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

Expand All @@ -53,7 +54,7 @@ public void testEndOfDelimiterReverse() throws Exception {
new String[] {"/a/b/c/", "a/b/c/", "b/c/", "c/"},
new int[] {0, 1, 3, 5},
new int[] {7, 7, 7, 7},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

Expand All @@ -68,7 +69,7 @@ public void testStartOfCharReverse() throws Exception {
new String[] {"a/b/c", "b/c", "c"},
new int[] {0, 2, 4},
new int[] {5, 5, 5},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand All @@ -83,7 +84,7 @@ public void testStartOfCharEndOfDelimiterReverse() throws Exception {
new String[] {"a/b/c/", "b/c/", "c/"},
new int[] {0, 2, 4},
new int[] {6, 6, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand All @@ -108,7 +109,7 @@ public void testOnlyDelimitersReverse() throws Exception {
new String[] {"//", "/"},
new int[] {0, 1},
new int[] {2, 2},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

Expand All @@ -124,7 +125,7 @@ public void testEndOfDelimiterReverseSkip() throws Exception {
new String[] {"/a/b/", "a/b/", "b/"},
new int[] {0, 1, 3},
new int[] {5, 5, 5},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

Expand All @@ -139,7 +140,7 @@ public void testStartOfCharReverseSkip() throws Exception {
new String[] {"a/b/", "b/"},
new int[] {0, 2},
new int[] {4, 4},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

Expand All @@ -154,7 +155,7 @@ public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception {
new String[] {"a/b/", "b/"},
new int[] {0, 2},
new int[] {4, 4},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

Expand Down Expand Up @@ -189,7 +190,7 @@ public void testReverseSkip2() throws Exception {
new String[] {"/a/", "a/"},
new int[] {0, 1},
new int[] {3, 3},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

Expand Down Expand Up @@ -227,4 +228,20 @@ protected TokenStreamComponents createComponents(String fieldName) {
checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false);
a.close();
}

/** Analyzer wrapping a bare {@link ReversePathHierarchyTokenizer} with default delimiter and skip. */
private final Analyzer analyzer =
    new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        ReversePathHierarchyTokenizer source = new ReversePathHierarchyTokenizer();
        return new TokenStreamComponents(source);
      }
    };

/**
 * Verifies the reverse tokenizer output end-to-end through an {@link Analyzer}, covering paths
 * with and without leading/trailing delimiters.
 */
public void testTokenizerViaAnalyzerOutput() throws IOException {
  assertReversePathTokens("a/b/c", "a/b/c", "b/c", "c");
  assertReversePathTokens("a/b/c/", "a/b/c/", "b/c/", "c/");
  assertReversePathTokens("/a/b/c", "/a/b/c", "a/b/c", "b/c", "c");
  assertReversePathTokens("/a/b/c/", "/a/b/c/", "a/b/c/", "b/c/", "c/");
}

/** Asserts that analyzing {@code path} yields exactly the given suffix tokens, in order. */
private void assertReversePathTokens(String path, String... expectedTokens) throws IOException {
  assertAnalyzesTo(analyzer, path, expectedTokens);
}
}

0 comments on commit 5d6086e

Please sign in to comment.