Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48282][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (StringReplace, FindInSet) #46682

Closed
wants to merge 15 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,19 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right));
}

/*
* Performs string replacement for ICU collations by searching for instances of the search
* string in the `src` string, with respect to the specified collation, and then replacing
* them with the replace string. The method returns a new UTF8String with all instances of the
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
* used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
*
* @param src the string to be searched in
* @param search the string to be searched for
* @param replace the string to be used as replacement
* @param collationId the collation ID to use for string search
* @return the position of the first occurrence of `match` in `set`
*/
public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -286,49 +299,47 @@ public static UTF8String replace(final UTF8String src, final UTF8String search,
return buf.build();
}

/*
* Performs string replacement for UTF8_LCASE collation by searching for instances of the search
* string in the src string, with respect to lowercased string versions, and then replacing
* them with the replace string. The method returns a new UTF8String with all instances of the
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
* used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
*
* @param src the string to be searched in
* @param search the string to be searched for
* @param replace the string to be used as replacement
* @param collationId the collation ID to use for string search
* @return the position of the first occurrence of `match` in `set`
*/
public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search,
final UTF8String replace) {
if (src.numBytes() == 0 || search.numBytes() == 0) {
return src;
}
UTF8String lowercaseString = src.toLowerCase();

// TODO(SPARK-48725): Use lowerCaseCodePoints instead of UTF8String.toLowerCase.
UTF8String lowercaseSearch = search.toLowerCase();

int start = 0;
int end = lowercaseString.indexOf(lowercaseSearch, 0);
int end = lowercaseFind(src, lowercaseSearch, start);
if (end == -1) {
// Search string was not found, so string is unchanged.
return src;
}

// Initialize byte positions
int c = 0;
int byteStart = 0; // position in byte
int byteEnd = 0; // position in byte
while (byteEnd < src.numBytes() && c < end) {
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
c += 1;
}

// At least one match was found. Estimate space needed for result.
// The 16x multiplier here is chosen to match commons-lang3's implementation.
int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
while (end != -1) {
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
buf.append(src.substring(start, end));
buf.append(replace);
// Update character positions
start = end + lowercaseSearch.numChars();
end = lowercaseString.indexOf(lowercaseSearch, start);
// Update byte positions
byteStart = byteEnd + search.numBytes();
while (byteEnd < src.numBytes() && c < end) {
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
c += 1;
}
start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
end = lowercaseFind(src, lowercaseSearch, start);
}
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
src.numBytes() - byteStart);
buf.append(src.substring(start, src.numChars()));
return buf.build();
}

Expand All @@ -344,7 +355,7 @@ public static UTF8String toUpperCase(final UTF8String target) {
}

private static UTF8String toUpperCaseSlow(final UTF8String target) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to uppercase, which only accepts a Java strings as input.
// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
return UTF8String.fromString(UCharacter.toUpperCase(target.toString()));
Expand All @@ -362,7 +373,7 @@ public static UTF8String toUpperCase(final UTF8String target, final int collatio
}

private static UTF8String toUpperCaseSlow(final UTF8String target, final int collationId) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to uppercase, which only accepts a Java strings as input.
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand All @@ -382,7 +393,7 @@ public static UTF8String toLowerCase(final UTF8String target) {
}

private static UTF8String toLowerCaseSlow(final UTF8String target) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to lowercase, which only accepts a Java strings as input.
// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
return UTF8String.fromString(UCharacter.toLowerCase(target.toString()));
Expand All @@ -400,7 +411,7 @@ public static UTF8String toLowerCase(final UTF8String target, final int collatio
}

private static UTF8String toLowerCaseSlow(final UTF8String target, final int collationId) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to lowercase, which only accepts a Java strings as input.
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand Down Expand Up @@ -461,7 +472,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) {
* Convert the input string to titlecase using the ICU root locale rules.
*/
public static UTF8String toTitleCase(final UTF8String target) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to titlecase, which only accepts a Java strings as input.
// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
return UTF8String.fromString(UCharacter.toTitleCase(target.toString(),
Expand All @@ -479,34 +490,40 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio
BreakIterator.getWordInstance(locale)));
}

/*
* Returns the position of the first occurrence of the match string in the set string,
* counting ASCII commas as delimiters. The match string is compared in a collation-aware manner,
* with respect to the specified collation ID. Similar to UTF8String.findInSet behavior used
* for UTF8_BINARY collation, the method returns 0 if the match string contains no commas.
*
* @param match the string to be searched for
* @param set the string to be searched in
* @param collationId the collation ID to use for string comparison
* @return the position of the first occurrence of `match` in `set`
*/
public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
// If the "word" string contains a comma, FindInSet should return 0.
if (match.contains(UTF8String.fromString(","))) {
return 0;
}

// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
String setString = set.toString();
StringSearch stringSearch = CollationFactory.getStringSearch(setString, match.toString(),
collationId);

int wordStart = 0;
while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ',';
boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length()
|| setString.charAt(wordStart + stringSearch.getMatchLength()) == ',';

if (isValidStart && isValidEnd) {
int pos = 0;
for (int i = 0; i < setString.length() && i < wordStart; i++) {
if (setString.charAt(i) == ',') {
pos++;
}
// Otherwise, search for commas in "set" and compare each substring with "word".
int byteIndex = 0, charIndex = 0, wordCount = 1, lastComma = -1;
while (byteIndex < set.numBytes()) {
byte nextByte = set.getByte(byteIndex);
if (nextByte == (byte) ',') {
if (set.substring(lastComma + 1, charIndex).semanticEquals(match, collationId)) {
return wordCount;
}

return pos + 1;
lastComma = charIndex;
++wordCount;
}
byteIndex += UTF8String.numBytesForFirstByte(nextByte);
++charIndex;
}

if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) {
return wordCount;
}
// If no match is found, return 0.
return 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -318,31 +318,24 @@ public static int exec(final UTF8String word, final UTF8String set, final int co
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(word, set);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(word, set);
} else {
return execICU(word, set, collationId);
return execCollationAware(word, set, collationId);
}
}
public static String genCode(final String word, final String set, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.FindInSet.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s, %s)", word, set);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s, %s)", word, set);
} else {
return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId);
return String.format(expr + "execCollationAware(%s, %s, %d)", word, set, collationId);
}
}
public static int execBinary(final UTF8String word, final UTF8String set) {
return set.findInSet(word);
}
public static int execLowercase(final UTF8String word, final UTF8String set) {
return set.toLowerCase().findInSet(word.toLowerCase());
}
public static int execICU(final UTF8String word, final UTF8String set,
final int collationId) {
public static int execCollationAware(final UTF8String word, final UTF8String set,
final int collationId) {
return CollationAwareUTF8String.findInSet(word, set, collationId);
}
}
Expand Down
Loading