Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48282][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (StringReplace, FindInSet) #46682

Closed
wants to merge 15 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,155 @@
* Utility class for collation-aware UTF8String operations.
*/
public class CollationAwareUTF8String {

/**
 * Sentinel value returned by the lowercase search helpers in this class when the pattern
 * string is not found in the target string. The helpers otherwise return a length or a
 * position counted upwards from zero, so a negative value cannot be confused with a match.
 */
private static final int MATCH_NOT_FOUND = -1;

/**
* Returns whether the target string starts with the specified prefix, starting from the
* specified position (0-based index referring to character position in UTF8String), with respect
* to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased
* prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
* same prefix string.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
*/
public static boolean lowercaseMatchFrom(
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final UTF8String target,
final UTF8String lowercasePattern,
int startPos) {
return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND;
}

/**
* Returns the length of the substring of the target string that starts with the specified
* prefix, starting from the specified position (0-based index referring to character position
* in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* prefix is already lowercased. The method only considers the part of target string that
* starts from the specified (inclusive) position (that is, the method does not look at UTF8
* characters of the target string before position `startPos`). If the prefix is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return length of the target substring that starts at `startPos` and matches the specified
* prefix in lowercase, or MATCH_NOT_FOUND if there is no such substring
*/
public static int lowercaseMatchLengthFrom(
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final UTF8String target,
final UTF8String lowercasePattern,
int startPos) {
assert startPos >= 0;
for (int len = 0; len <= target.numChars() - startPos; ++len) {
if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
return len;
}
}
return MATCH_NOT_FOUND;
}

/**
* Returns the position of the first occurrence of the pattern string in the target string,
* starting from the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* pattern string is already lowercased prior to call. If the pattern is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return the position of the first occurrence of pattern in target
*/
public static int lowercaseFind(
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final UTF8String target,
final UTF8String lowercasePattern,
int startPos) {
assert startPos >= 0;
for (int i = startPos; i <= target.numChars(); ++i) {
if (lowercaseMatchFrom(target, lowercasePattern, i)) {
return i;
}
}
return MATCH_NOT_FOUND;
}

/**
* Returns whether the target string ends with the specified suffix, ending at the specified
* position (0-based index referring to character position in UTF8String), with respect to the
* UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior
* to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
* suffix string.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
* @return whether the target string ends with the specified suffix in lowercase
*/
public static boolean lowercaseMatchUntil(
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND;
}

/**
* Returns the length of the substring of the target string that ends with the specified
* suffix, ending at the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* suffix is already lowercased. The method only considers the part of target string that ends
* at the specified (non-inclusive) position (that is, the method does not look at UTF8
* characters of the target string at or after position `endPos`). If the suffix is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
* @return length of the target substring that ends with the specified suffix in lowercase
*/
public static int lowercaseMatchLengthUntil(
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
assert endPos <= target.numChars();
for (int len = 0; len <= endPos; ++len) {
if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
return len;
}
}
return MATCH_NOT_FOUND;
}

/**
* Returns the position of the last occurrence of the pattern string in the target string,
* ending at the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* pattern string is already lowercased prior to call. If the pattern is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
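* @return the (exclusive) end position of the last occurrence of pattern in target that ends
* at or before `endPos`, or MATCH_NOT_FOUND if there is no such occurrence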
*/
public static int lowercaseRFind(
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
assert endPos <= target.numChars();
for (int i = endPos; i >= 0; --i) {
if (lowercaseMatchUntil(target, lowercasePattern, i)) {
return i;
}
}
return MATCH_NOT_FOUND;
}

uros-db marked this conversation as resolved.
Show resolved Hide resolved
public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -94,44 +243,28 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
if (src.numBytes() == 0 || search.numBytes() == 0) {
return src;
}
UTF8String lowercaseString = src.toLowerCase();

UTF8String lowercaseSearch = search.toLowerCase();

int start = 0;
int end = lowercaseString.indexOf(lowercaseSearch, 0);
int end = lowercaseFind(src, lowercaseSearch, start);
if (end == -1) {
// Search string was not found, so string is unchanged.
return src;
}

// Initialize byte positions
int c = 0;
int byteStart = 0; // position in byte
int byteEnd = 0; // position in byte
while (byteEnd < src.numBytes() && c < end) {
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
c += 1;
}

// At least one match was found. Estimate space needed for result.
// The 16x multiplier here is chosen to match commons-lang3's implementation.
int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
while (end != -1) {
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
buf.append(src.substring(start, end));
buf.append(replace);
// Update character positions
start = end + lowercaseSearch.numChars();
end = lowercaseString.indexOf(lowercaseSearch, start);
// Update byte positions
byteStart = byteEnd + search.numBytes();
while (byteEnd < src.numBytes() && c < end) {
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
c += 1;
}
start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
end = lowercaseFind(src, lowercaseSearch, start);
}
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
src.numBytes() - byteStart);
buf.append(src.substring(start, src.numChars()));
return buf.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

import static org.junit.jupiter.api.Assertions.*;


// checkstyle.off: AvoidEscapedUnicodeCharacters
public class CollationSupportSuite {

/**
Expand Down Expand Up @@ -610,8 +610,42 @@ public void testFindInSet() throws SparkException {
assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4);
assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5);
assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5);
assertFindInSet("i̇", "İ", "UNICODE_CI", 1);
uros-db marked this conversation as resolved.
Show resolved Hide resolved
assertFindInSet("i", "İ", "UNICODE_CI", 0);
assertFindInSet("i̇", "i̇", "UNICODE_CI", 1);
assertFindInSet("i", "i̇", "UNICODE_CI", 0);
assertFindInSet("i̇", "İ,", "UNICODE_CI", 1);
assertFindInSet("i", "İ,", "UNICODE_CI", 0);
assertFindInSet("i̇", "i̇,", "UNICODE_CI", 1);
assertFindInSet("i", "i̇,", "UNICODE_CI", 0);
assertFindInSet("i̇", "ab,İ", "UNICODE_CI", 2);
assertFindInSet("i", "ab,İ", "UNICODE_CI", 0);
assertFindInSet("i̇", "ab,i̇", "UNICODE_CI", 2);
assertFindInSet("i", "ab,i̇", "UNICODE_CI", 0);
assertFindInSet("i̇", "ab,İ,12", "UNICODE_CI", 2);
assertFindInSet("i", "ab,İ,12", "UNICODE_CI", 0);
assertFindInSet("i̇", "ab,i̇,12", "UNICODE_CI", 2);
assertFindInSet("i", "ab,i̇,12", "UNICODE_CI", 0);
assertFindInSet("i̇o", "ab,İo,12", "UNICODE_CI", 2);
assertFindInSet("İo", "ab,i̇o,12", "UNICODE_CI", 2);
assertFindInSet("i̇", "İ", "UTF8_BINARY_LCASE", 1);
assertFindInSet("i", "İ", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇", "i̇", "UTF8_BINARY_LCASE", 1);
assertFindInSet("i", "i̇", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇", "İ,", "UTF8_BINARY_LCASE", 1);
assertFindInSet("i", "İ,", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇", "i̇,", "UTF8_BINARY_LCASE", 1);
assertFindInSet("i", "i̇,", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇", "ab,İ", "UTF8_BINARY_LCASE", 2);
assertFindInSet("i", "ab,İ", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇", "ab,i̇", "UTF8_BINARY_LCASE", 2);
assertFindInSet("i", "ab,i̇", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇", "ab,İ,12", "UTF8_BINARY_LCASE", 2);
assertFindInSet("i", "ab,İ,12", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇", "ab,i̇,12", "UTF8_BINARY_LCASE", 2);
assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0);
assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2);
assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2);
uros-db marked this conversation as resolved.
Show resolved Hide resolved
}

private void assertReplace(String source, String search, String replace, String collationName,
Expand Down Expand Up @@ -648,8 +682,22 @@ public void testReplace() throws SparkException {
assertReplace("replace", "", "123", "UNICODE_CI", "replace");
assertReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c");
assertReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad");
assertReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12");
assertReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12");
assertReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12");
assertReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12");
assertReplace("İi̇İi̇İi̇", "i̇", "x", "UNICODE_CI", "xxxxxx");
assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx");
assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abẊ12"); // != UNICODE_CI
uros-db marked this conversation as resolved.
Show resolved Hide resolved
assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI
assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12");
assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12");
assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx");
assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", "İẋİẋİẋ"); // != UNICODE_CI
uros-db marked this conversation as resolved.
Show resolved Hide resolved
assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx");
assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy");
}

private void assertLocate(String substring, String string, Integer start, String collationName,
Expand Down Expand Up @@ -1008,3 +1056,4 @@ public void testStringTrim() throws SparkException {
// TODO: Test other collation-aware expressions.

}
// checkstyle.on: AvoidEscapedUnicodeCharacters