diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8ba40ef88d06..b4a88a8fd10a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,6 +7,8 @@ http://s.apache.org/luceneversions API Changes --------------------- +* GITHUB#15340: Support multiple delimiters for path tokenization. (David Pilato) + * GITHUB#15215: Switch to Java 25 as the minimum required platform. Upgrade to gradle 9.1.0. (Robert Muir, Kaival Parikh, Dawid Weiss) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index dfd727570342..7bf541fc3397 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -75,6 +75,11 @@ public PathHierarchyTokenizer(int bufferSize, char delimiter, char replacement, public PathHierarchyTokenizer( AttributeFactory factory, int bufferSize, char delimiter, char replacement, int skip) { + /* Single-delimiter form: wrap the delimiter in a one-element array and delegate. */ this(factory, bufferSize, new char[] {delimiter}, replacement, skip); + } + + /** Creates a tokenizer that recognizes any character in {@code delimiters} as a path delimiter. */ public PathHierarchyTokenizer( + AttributeFactory factory, int bufferSize, char[] delimiters, char replacement, int skip) { super(factory); if (bufferSize < 0) { throw new IllegalArgumentException("bufferSize cannot be negative"); @@ -84,7 +89,7 @@ public PathHierarchyTokenizer( } termAtt.resizeBuffer(bufferSize); - this.delimiter = delimiter; + /* NOTE(review): the caller's array is stored as-is (no defensive copy), and no null or empty-array check is visible in this hunk -- confirm this is intended. */ this.delimiters = delimiters; this.replacement = replacement; this.skip = skip; resultToken = new StringBuilder(bufferSize); @@ -94,7 +99,7 @@ public PathHierarchyTokenizer( public static final char DEFAULT_DELIMITER = '/'; public static final int DEFAULT_SKIP = 0; - private final char delimiter; + /* Characters that are each treated as a path delimiter; see isDelimiterFound. */ private final char[] delimiters; private final char replacement; private final int skip; @@ -145,13 +150,13 @@ public final boolean incrementToken() throws IOException { 
added = true; skipped++; if (skipped > skip) { - termAtt.append(c == delimiter ? replacement : (char) c); + /* Any configured delimiter is written to the term as the replacement character. */ termAtt.append(isDelimiterFound((char) c) ? replacement : (char) c); length++; } else { startPosition++; } } else { - if (c == delimiter) { + /* Any configured delimiter can end the current token (once skipped exceeds skip). */ if (isDelimiterFound((char) c)) { if (skipped > skip) { endDelimiter = true; break; @@ -181,6 +186,15 @@ public final boolean incrementToken() throws IOException { return true; } + /** Returns {@code true} if {@code c} equals any of the configured delimiter characters (linear scan of the array). */ private boolean isDelimiterFound(char c) { + for (char delimiter : delimiters) { + if (c == delimiter) { + return true; + } + } + return false; + } + @Override public final void end() throws IOException { super.end(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java index 1662171c9bac..3b75deef1fa3 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java @@ -237,6 +237,30 @@ public void testOnlyDelimitersSkip() throws Exception { t, new String[] {"/"}, new int[] {1}, new int[] {2}, new int[] {1}, path.length()); } + /** Verifies that one tokenizer configured with both '/' and backslash as delimiters (replacement '/') tokenizes a Windows-style path and a Unix-style path, with backslashes emitted as '/'. */ public void testWindowsAndLinuxPaths() throws Exception { + String path1 = "c:\\a\\b\\c"; + String path2 = "/a/b/c"; + PathHierarchyTokenizer t = + new PathHierarchyTokenizer( + newAttributeFactory(), 1024, new char[] {'/', '\\'}, '/', DEFAULT_SKIP); + t.setReader(new StringReader(path1)); + assertTokenStreamContents( + t, + new String[] {"c:", "c:/a", "c:/a/b", "c:/a/b/c"}, + new int[] {0, 0, 0, 0}, + new int[] {2, 4, 6, 8}, + new int[] {1, 1, 1, 1}, + path1.length()); + /* Reuse the same tokenizer instance for the second path. */ t.setReader(new StringReader(path2)); + assertTokenStreamContents( + t, + new String[] {"/a", "/a/b", "/a/b/c"}, + new int[] {0, 0, 0}, + new int[] {2, 4, 6}, + new int[] {1, 1, 1}, + path2.length()); + } + /** blast some random strings through the analyzer 
*/ public void testRandomStrings() throws Exception { Analyzer a =