From fd9a5f67c4b8c1741dc7bc95b12bbcad2379b92c Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 16 Oct 2025 14:30:45 +0200 Subject: [PATCH 1/3] Support multiple delimiters for path tokenization This adds a new ctor for the `PathHierarchyTokenizer`, which supports an array of delimiters instead of a single one: `PathHierarchyTokenizer(AttributeFactory, int, char[], char, int)`. This allows having two different delimiters instead of a single one. Closes #15196. --- .../analysis/path/PathHierarchyTokenizer.java | 22 ++++++++++++++---- .../path/TestPathHierarchyTokenizer.java | 23 +++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index dfd727570342..7bf541fc3397 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -75,6 +75,11 @@ public PathHierarchyTokenizer(int bufferSize, char delimiter, char replacement, public PathHierarchyTokenizer( AttributeFactory factory, int bufferSize, char delimiter, char replacement, int skip) { + this(factory, bufferSize, new char[] {delimiter}, replacement, skip); + } + + public PathHierarchyTokenizer( + AttributeFactory factory, int bufferSize, char[] delimiters, char replacement, int skip) { super(factory); if (bufferSize < 0) { throw new IllegalArgumentException("bufferSize cannot be negative"); @@ -84,7 +89,7 @@ public PathHierarchyTokenizer( } termAtt.resizeBuffer(bufferSize); - this.delimiter = delimiter; + this.delimiters = delimiters; this.replacement = replacement; this.skip = skip; resultToken = new StringBuilder(bufferSize); @@ -94,7 +99,7 @@ public PathHierarchyTokenizer( public static final char DEFAULT_DELIMITER = '/'; public static final int DEFAULT_SKIP = 0; - private final char delimiter; + private final char[] delimiters; private final char replacement; private final int skip; @@ -145,13 +150,13 @@ public final boolean incrementToken() throws IOException { added = true; skipped++; if (skipped > skip) { - termAtt.append(c == delimiter ? replacement : (char) c); + termAtt.append(isDelimiterFound((char) c) ? replacement : (char) c); length++; } else { startPosition++; } } else { - if (c == delimiter) { + if (isDelimiterFound((char) c)) { if (skipped > skip) { endDelimiter = true; break; @@ -181,6 +186,15 @@ public final boolean incrementToken() throws IOException { return true; } + private boolean isDelimiterFound(char c) { + for (char delimiter : delimiters) { + if (c == delimiter) { + return true; + } + } + return false; + } + @Override public final void end() throws IOException { super.end(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java index 1662171c9bac..37bac1e79a43 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java @@ -237,6 +237,29 @@ public void testOnlyDelimitersSkip() throws Exception { t, new String[] {"/"}, new int[] {1}, new int[] {2}, new int[] {1}, path.length()); } + public void testWindowsAndLinuxPaths() throws Exception { + String path1 = "c:\\a\\b\\c"; + String path2 = "/a/b/c"; + PathHierarchyTokenizer t = + new PathHierarchyTokenizer(newAttributeFactory(), 1024, new char[] {'/', '\\'}, '/', DEFAULT_SKIP); + t.setReader(new StringReader(path1)); + assertTokenStreamContents( + t, + new String[] {"c:", "c:/a", "c:/a/b", "c:/a/b/c"}, + new int[] {0, 0, 0, 0}, + new int[] {2, 4, 6, 8}, + new int[] {1, 1, 1, 1}, + path1.length()); + t.setReader(new StringReader(path2)); + assertTokenStreamContents( + t, + new String[] {"/a", "/a/b", "/a/b/c"}, + new int[] {0, 0, 0}, + new int[] {2, 4, 6}, + new int[] {1, 1, 1}, + path2.length()); + } + /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { Analyzer a = From 7f63f5d7c51c29afedc5701692712d03392e50bd Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 16 Oct 2025 14:57:06 +0200 Subject: [PATCH 2/3] Format code --- .../lucene/analysis/path/TestPathHierarchyTokenizer.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java index 37bac1e79a43..3b75deef1fa3 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java @@ -241,7 +241,8 @@ public void testWindowsAndLinuxPaths() throws Exception { String path1 = "c:\\a\\b\\c"; String path2 = "/a/b/c"; PathHierarchyTokenizer t = - new PathHierarchyTokenizer(newAttributeFactory(), 1024, new char[] {'/', '\\'}, '/', DEFAULT_SKIP); + new PathHierarchyTokenizer( + newAttributeFactory(), 1024, new char[] {'/', '\\'}, '/', DEFAULT_SKIP); t.setReader(new StringReader(path1)); assertTokenStreamContents( t, From 30daf9bdec8c43116e06ed9de3e88623501e4258 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 16 Oct 2025 14:58:50 +0200 Subject: [PATCH 3/3] Add PR to CHANGES.txt --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8ba40ef88d06..b4a88a8fd10a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,6 +7,8 @@ http://s.apache.org/luceneversions API Changes --------------------- +* GITHUB#15340: Support multiple delimiters for path tokenization. (David Pilato) + * GITHUB#15215: Switch to Java 25 as the minimum required platform. Upgrade to gradle 9.1.0. (Robert Muir, Kaival Parikh, Dawid Weiss)