From da66135f10a4eb0e6edb4d27f07331044ae971d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=A1s=20Lichtmaier?= Date: Fri, 11 Aug 2017 17:19:36 -0300 Subject: [PATCH] Added a new camelCase option to WordDelimiterGraphFilter that enables better support for camel case. Before WordDelimiterGraphFilter only looked at pairs of characters, now (if enabled) it will split when an upper case character is followed by another upper case character and then there's a lower case character. Like this in HTTPRequest: 'P' 'R' 'e' It will also break the word when a lower case character is followed by a number. With this flag these splits happen: HTTPRequest -> HTTP Request 3DPlot -> 3D Plot Plot3D -> Plot 3D --- .../miscellaneous/WordDelimiterFilter.java | 9 +++- .../WordDelimiterFilterFactory.java | 3 ++ .../WordDelimiterGraphFilter.java | 18 +++++++- .../WordDelimiterGraphFilterFactory.java | 3 ++ .../miscellaneous/WordDelimiterIterator.java | 41 +++++++++++++++---- .../TestWordDelimiterGraphFilter.java | 15 +++++++ 6 files changed, 77 insertions(+), 12 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index aef697ce4ffe..9bb0762c8808 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -158,6 +158,13 @@ public final class WordDelimiterFilter extends TokenFilter { * "O'Neil's" => "O", "Neil" */ public static final int STEM_ENGLISH_POSSESSIVE = 256; + + /** + * Support camel case better. + *

+ * "HTTPRequest" => "HTTP", "Request" + */ + public static final int CAMEL_CASE = 512; /** * If not null is the set of tokens to protect from being delimited @@ -214,7 +221,7 @@ public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurati this.flags = configurationFlags; this.protWords = protWords; this.iterator = new WordDelimiterIterator( - charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE)); + charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE), has(CAMEL_CASE)); } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java index 0002d65331c9..47f77d31eb88 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java @@ -95,6 +95,9 @@ public WordDelimiterFilterFactory(Map args) { if (getInt(args, "stemEnglishPossessive", 1) != 0) { flags |= STEM_ENGLISH_POSSESSIVE; } + if (getInt(args, "camelCase", 1) != 0) { + flags |= CAMEL_CASE; + } wordFiles = get(args, PROTECTED_TOKENS); types = get(args, TYPES); this.flags = flags; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index a6ade199545c..59706318f287 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -151,6 +151,13 @@ public final class WordDelimiterGraphFilter extends TokenFilter { * "O'Neil's" => "O", "Neil" */ public static final 
int STEM_ENGLISH_POSSESSIVE = 256; + + /** + * Support camel case better. + *

+ * "HTTPRequest" => "HTTP", "Request" + */ + public static final int CAMEL_CASE = 512; /** * If not null is the set of tokens to protect from being delimited @@ -220,13 +227,14 @@ public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int config PRESERVE_ORIGINAL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | - STEM_ENGLISH_POSSESSIVE)) != 0) { + STEM_ENGLISH_POSSESSIVE | + CAMEL_CASE)) != 0) { throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags); } this.flags = configurationFlags; this.protWords = protWords; this.iterator = new WordDelimiterIterator( - charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE)); + charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE), has(CAMEL_CASE)); } /** @@ -683,6 +691,12 @@ public static String flagsToString(int flags) { } b.append("STEM_ENGLISH_POSSESSIVE"); } + if ((flags & CAMEL_CASE) != 0) { + if (b.length() > 0) { + b.append(" | "); + } + b.append("CAMEL_CASE"); + } return b.toString(); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java index a06cc7502d73..0cf57c12e11d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java @@ -90,6 +90,9 @@ public WordDelimiterGraphFilterFactory(Map args) { if (getInt(args, "stemEnglishPossessive", 1) != 0) { flags |= STEM_ENGLISH_POSSESSIVE; } + if (getInt(args, "camelCase", 1) != 0) { + flags |= CAMEL_CASE; + } wordFiles = get(args, PROTECTED_TOKENS); types = get(args, TYPES); this.flags = flags; diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java index 86b983d8b060..f06a30c956db 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java @@ -71,6 +71,12 @@ public final class WordDelimiterIterator { */ final boolean stemEnglishPossessive; + /** If true, word splitting will try to support camel case better. + *

+ * "HTTPRequest" => "HTTP", "Request" + */ + final boolean camelCase; + private final byte[] charTypeTable; /** if true, need to skip over a possessive found in the last call to next() */ @@ -106,12 +112,14 @@ else if (Character.isDigit(i)) { * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless) * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se" * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" + * @param camelCase if true, word splitting will try to support camel case better: "HTTPRequest" => "HTTP", "Request" */ - WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) { + WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive, boolean camelCase) { this.charTypeTable = charTypeTable; this.splitOnCaseChange = splitOnCaseChange; this.splitOnNumerics = splitOnNumerics; this.stemEnglishPossessive = stemEnglishPossessive; + this.camelCase = camelCase; } /** @@ -139,13 +147,15 @@ int next() { if (current >= endBounds) { return end = DONE; } - + + int type = (current + 1) < endBounds ? charType(text[current + 1]) : 0; for (end = current + 1; end < endBounds; end++) { - int type = charType(text[end]); - if (isBreak(lastType, type)) { + int nextType = (end+1) < endBounds ? 
charType(text[end + 1]) : 0; + if (isBreak(lastType, type, nextType)) { break; } lastType = type; + type = nextType; } if (end < endBounds - 1 && endsWithPossessive(end + 2)) { @@ -199,19 +209,23 @@ void setText(char text[], int length) { * * @param lastType Last subword type * @param type Current subword type + * @param nextType Following subword type * @return {@code true} if the transition indicates a break, {@code false} otherwise */ - private boolean isBreak(int lastType, int type) { - if ((type & lastType) != 0) { + private boolean isBreak(int lastType, int type, int nextType) { + if ((!camelCase || !isUpper(type)) && (type & lastType) != 0) { return false; } - if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) { + if (!(splitOnCaseChange || camelCase) && isAlpha(lastType) && isAlpha(type)) { // ALPHA->ALPHA: always ignore if case isn't considered. return false; } else if (isUpper(lastType) && isAlpha(type)) { - // UPPER->letter: Don't split - return false; + // UPPER->letter: Don't split unless camelCase is on and it's the case UPPER->UPPER ( -> LOWER ) + return !camelCase ? 
false : (isUpper(type) && isLower(nextType)); + } else if(camelCase && isLower(lastType) && isDigit(type)) { + // when camelCase is on, split on LOWER -> DIGIT + return true; } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) { // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split return false; @@ -370,4 +384,13 @@ static boolean isUpper(int type) { return (type & UPPER) != 0; } + /** + * Checks if the given word type includes {@link #LOWER} + * + * @param type Word type to check + * @return {@code true} if the type contains LOWER, {@code false} otherwise + */ + static boolean isLower(int type) { + return (type & LOWER) != 0; + } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java index 7516a23fd65c..0996176db68a 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java @@ -721,6 +721,21 @@ public void testBasicGraphSplits() throws Exception { "PowerShot 100017 Plus"); } + public void testCamelCase() throws Exception { + assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CAMEL_CASE), + "HTTPRequest", + "HTTP Request"); + assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CAMEL_CASE), + "NExpect", + "N Expect"); + assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CAMEL_CASE), + "3DPlot", + "3D Plot"); + assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CAMEL_CASE), + "Plot3D", + "Plot 3D"); + } + /* public void testToDot() throws Exception { int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | 
STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE;