Fix position increment in (Reverse)PathHierarchyTokenizer #12875

Merged 2 commits on Dec 18, 2023
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -77,6 +77,9 @@ API Changes

* GITHUB#12855: Remove deprecated DrillSideways#createDrillDownFacetsCollector extension method. (Greg Miller)

* GITHUB#12875: Ensure the token position is always incremented in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
  so that the resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)

New Features
---------------------

5 changes: 5 additions & 0 deletions lucene/MIGRATE.md
@@ -134,6 +134,11 @@ It now declares that it may throw `IOException`. This was an oversight because
compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
may throw `IOException` on index problems, bubbling up unexpectedly to the caller.

### PathHierarchyTokenizer and ReversePathHierarchyTokenizer do not produce overlapping tokens

`(Reverse)PathHierarchyTokenizer` now produces sequential (instead of overlapping) tokens with accurate
offsets, making positional queries and highlighters possible for fields tokenized with this tokenizer.

## Migration from Lucene 9.0 to Lucene 9.1

### Test framework package migration and module (LUCENE-10301)
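To illustrate the MIGRATE.md note above: a minimal sketch (not part of this change set; the demo class name is arbitrary) that prints each token emitted by `PathHierarchyTokenizer` together with its position increment. It assumes Lucene 9.x with `lucene-core` and `lucene-analysis-common` on the classpath.

```java
import java.io.StringReader;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PathHierarchyPositionsDemo {
  public static void main(String[] args) throws Exception {
    try (PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer()) {
      tokenizer.setReader(new StringReader("/a/b/c"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posInc = tokenizer.addAttribute(PositionIncrementAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        // With this change every token advances the position by 1:
        //   /a (+1), /a/b (+1), /a/b/c (+1)   (previously +1, +0, +0)
        System.out.println(term.toString() + " (+" + posInc.getPositionIncrement() + ")");
      }
      tokenizer.end();
    }
  }
}
```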
lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
@@ -100,7 +100,8 @@ public PathHierarchyTokenizer(

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);
private int startPosition = 0;
private int skipped = 0;
private boolean endDelimiter = false;
@@ -112,11 +113,7 @@ public PathHierarchyTokenizer(
public final boolean incrementToken() throws IOException {
clearAttributes();
termAtt.append(resultToken);
if (resultToken.length() == 0) {
posAtt.setPositionIncrement(1);
} else {
posAtt.setPositionIncrement(0);
}
posIncAtt.setPositionIncrement(1);
int length = 0;
boolean added = false;
if (endDelimiter) {
lucene/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
@@ -112,7 +112,8 @@ public ReversePathHierarchyTokenizer(

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionIncrementAttribute posIncAtt =
addAttribute(PositionIncrementAttribute.class);

private int endPosition = 0;
private int finalOffset = 0;
@@ -158,10 +159,8 @@ public final boolean incrementToken() throws IOException {
endPosition = delimiterPositions.get(idx);
}
finalOffset = correctOffset(length);
posAtt.setPositionIncrement(1);
} else {
posAtt.setPositionIncrement(0);
}
posIncAtt.setPositionIncrement(1);

while (skipped < delimitersCount - skip - 1) {
int start = delimiterPositions.get(skipped);
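The practical effect of the two tokenizer changes above is that the emitted tokens occupy consecutive positions, so positional queries over such a field can now match. A sketch under stated assumptions: the field name `path`, the in-memory index, and the anonymous analyzer are illustrative only, not part of this change.

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.store.ByteBuffersDirectory;

public class PathPhraseQueryDemo {
  public static void main(String[] args) throws Exception {
    // Analyzer that tokenizes fields with PathHierarchyTokenizer only.
    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new PathHierarchyTokenizer();
            return new TokenStreamComponents(tokenizer);
          }
        };
    try (ByteBuffersDirectory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
      Document doc = new Document();
      doc.add(new TextField("path", "/a/b/c", Field.Store.NO));
      writer.addDocument(doc);
      try (DirectoryReader reader = DirectoryReader.open(writer)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // The field now holds "/a", "/a/b", "/a/b/c" at consecutive positions,
        // so this exact phrase matches; with the old overlapping positions it did not.
        PhraseQuery query = new PhraseQuery("path", "/a", "/a/b", "/a/b/c");
        System.out.println("hits: " + searcher.count(query)); // expected: hits: 1
      }
    }
  }
}
```

Highlighters benefit for the same reason: positions and offsets now advance consistently across the emitted tokens.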
lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java
@@ -19,6 +19,7 @@
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_SKIP;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Random;
@@ -41,7 +42,7 @@ public void testBasic() throws Exception {
new String[] {"/a", "/a/b", "/a/b/c"},
new int[] {0, 0, 0},
new int[] {2, 4, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -56,7 +57,7 @@ public void testEndOfDelimiter() throws Exception {
new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"},
new int[] {0, 0, 0, 0},
new int[] {2, 4, 6, 7},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

@@ -71,7 +72,7 @@ public void testStartOfChar() throws Exception {
new String[] {"a", "a/b", "a/b/c"},
new int[] {0, 0, 0},
new int[] {1, 3, 5},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -86,7 +87,7 @@ public void testStartOfCharEndOfDelimiter() throws Exception {
new String[] {"a", "a/b", "a/b/c", "a/b/c/"},
new int[] {0, 0, 0, 0},
new int[] {1, 3, 5, 6},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

@@ -111,7 +112,7 @@ public void testOnlyDelimiters() throws Exception {
new String[] {"/", "//"},
new int[] {0, 0},
new int[] {1, 2},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

@@ -125,7 +126,7 @@ public void testReplace() throws Exception {
new String[] {"\\a", "\\a\\b", "\\a\\b\\c"},
new int[] {0, 0, 0},
new int[] {2, 4, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -139,7 +140,7 @@ public void testWindowsPath() throws Exception {
new String[] {"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},
new int[] {0, 0, 0, 0},
new int[] {2, 4, 6, 8},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

@@ -158,7 +159,7 @@ public void testNormalizeWinDelimToLinuxDelim() throws Exception {
new String[] {"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
new int[] {0, 0, 0, 0},
new int[] {2, 4, 6, 8},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

@@ -172,7 +173,7 @@ public void testBasicSkip() throws Exception {
new String[] {"/b", "/b/c"},
new int[] {2, 2},
new int[] {4, 6},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

@@ -186,7 +187,7 @@ public void testEndOfDelimiterSkip() throws Exception {
new String[] {"/b", "/b/c", "/b/c/"},
new int[] {2, 2, 2},
new int[] {4, 6, 7},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -200,7 +201,7 @@ public void testStartOfCharSkip() throws Exception {
new String[] {"/b", "/b/c"},
new int[] {1, 1},
new int[] {3, 5},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

@@ -214,7 +215,7 @@ public void testStartOfCharEndOfDelimiterSkip() throws Exception {
new String[] {"/b", "/b/c", "/b/c/"},
new int[] {1, 1, 1},
new int[] {3, 5, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -270,4 +271,20 @@ protected TokenStreamComponents createComponents(String fieldName) {
checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false);
a.close();
}

private final Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new PathHierarchyTokenizer();
return new TokenStreamComponents(tokenizer);
}
};

public void testTokenizerViaAnalyzerOutput() throws IOException {
assertAnalyzesTo(analyzer, "a/b/c", new String[] {"a", "a/b", "a/b/c"});
assertAnalyzesTo(analyzer, "a/b/c/", new String[] {"a", "a/b", "a/b/c", "a/b/c/"});
assertAnalyzesTo(analyzer, "/a/b/c", new String[] {"/a", "/a/b", "/a/b/c"});
assertAnalyzesTo(analyzer, "/a/b/c/", new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"});
}
}
lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java
@@ -19,6 +19,7 @@
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_DELIMITER;
import static org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer.DEFAULT_SKIP;

import java.io.IOException;
import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
@@ -38,7 +39,7 @@ public void testBasicReverse() throws Exception {
new String[] {"/a/b/c", "a/b/c", "b/c", "c"},
new int[] {0, 1, 3, 5},
new int[] {6, 6, 6, 6},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

@@ -53,7 +54,7 @@ public void testEndOfDelimiterReverse() throws Exception {
new String[] {"/a/b/c/", "a/b/c/", "b/c/", "c/"},
new int[] {0, 1, 3, 5},
new int[] {7, 7, 7, 7},
new int[] {1, 0, 0, 0},
new int[] {1, 1, 1, 1},
path.length());
}

@@ -68,7 +69,7 @@ public void testStartOfCharReverse() throws Exception {
new String[] {"a/b/c", "b/c", "c"},
new int[] {0, 2, 4},
new int[] {5, 5, 5},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -83,7 +84,7 @@ public void testStartOfCharEndOfDelimiterReverse() throws Exception {
new String[] {"a/b/c/", "b/c/", "c/"},
new int[] {0, 2, 4},
new int[] {6, 6, 6},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -108,7 +109,7 @@ public void testOnlyDelimitersReverse() throws Exception {
new String[] {"//", "/"},
new int[] {0, 1},
new int[] {2, 2},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

@@ -124,7 +125,7 @@ public void testEndOfDelimiterReverseSkip() throws Exception {
new String[] {"/a/b/", "a/b/", "b/"},
new int[] {0, 1, 3},
new int[] {5, 5, 5},
new int[] {1, 0, 0},
new int[] {1, 1, 1},
path.length());
}

@@ -139,7 +140,7 @@ public void testStartOfCharReverseSkip() throws Exception {
new String[] {"a/b/", "b/"},
new int[] {0, 2},
new int[] {4, 4},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

@@ -154,7 +155,7 @@ public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception {
new String[] {"a/b/", "b/"},
new int[] {0, 2},
new int[] {4, 4},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

@@ -189,7 +190,7 @@ public void testReverseSkip2() throws Exception {
new String[] {"/a/", "a/"},
new int[] {0, 1},
new int[] {3, 3},
new int[] {1, 0},
new int[] {1, 1},
path.length());
}

@@ -227,4 +228,20 @@ protected TokenStreamComponents createComponents(String fieldName) {
checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false);
a.close();
}

private final Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ReversePathHierarchyTokenizer();
return new TokenStreamComponents(tokenizer);
}
};

public void testTokenizerViaAnalyzerOutput() throws IOException {
assertAnalyzesTo(analyzer, "a/b/c", new String[] {"a/b/c", "b/c", "c"});
assertAnalyzesTo(analyzer, "a/b/c/", new String[] {"a/b/c/", "b/c/", "c/"});
assertAnalyzesTo(analyzer, "/a/b/c", new String[] {"/a/b/c", "a/b/c", "b/c", "c"});
assertAnalyzesTo(analyzer, "/a/b/c/", new String[] {"/a/b/c/", "a/b/c/", "b/c/", "c/"});
}
}