Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48221][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (Contains, StartsWith, EndsWith, StringLocate) #46511

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,27 @@
* Utility class for collation-aware UTF8String operations.
*/
public class CollationAwareUTF8String {

public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) {
if (pos < 0) return false;
uros-db marked this conversation as resolved.
Show resolved Hide resolved
for (int len = 0; len <= l.numChars() - pos; len++) {
uros-db marked this conversation as resolved.
Show resolved Hide resolved
if (l.substring(pos, pos + len).toLowerCase().equals(r)) {
uros-db marked this conversation as resolved.
Show resolved Hide resolved
return true;
}
}
return false;
}

public static boolean lowercaseMatchUntil(final UTF8String l, final UTF8String r, int pos) {
if (pos > l.numChars()) return false;
uros-db marked this conversation as resolved.
Show resolved Hide resolved
for (int len = 1; len <= pos; len++) {
uros-db marked this conversation as resolved.
Show resolved Hide resolved
if (l.substring(pos - len, pos).toLowerCase().equals(r)) {
return true;
}
}
return false;
}

public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -183,6 +204,19 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
return 0;
}

public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final int start) {
if (pattern.numChars() == 0) return 0;
uros-db marked this conversation as resolved.
Show resolved Hide resolved
int lenHaystack = target.numChars(), lenNeedle = pattern.numChars();
final UTF8String needle = pattern.toLowerCase();
for (int i = start; i <= (lenHaystack - lenNeedle); i++) {
uros-db marked this conversation as resolved.
Show resolved Hide resolved
if (CollationAwareUTF8String.lowercaseMatchFrom(target, needle, i)) {
return i;
}
}
return -1;
}

public static int indexOf(final UTF8String target, final UTF8String pattern,
final int start, final int collationId) {
if (pattern.numBytes() == 0) {
Expand Down Expand Up @@ -467,4 +501,7 @@ public static UTF8String lowercaseTrimRight(
}
return srcString.copyUTF8String(0, trimByteIdx);
}

// TODO: Add more collation-aware UTF8String operations here.

}
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.contains(r);
}
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
return l.containsInLowerCase(r);
if (r.numBytes() == 0) return true;
uros-db marked this conversation as resolved.
Show resolved Hide resolved
return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
Expand Down Expand Up @@ -156,7 +157,8 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.startsWith(r);
}
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
return l.startsWithInLowerCase(r);
if (r.numBytes() == 0) return true;
uros-db marked this conversation as resolved.
Show resolved Hide resolved
return CollationAwareUTF8String.lowercaseMatchFrom(l, r.toLowerCase(), 0);
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
Expand Down Expand Up @@ -193,7 +195,8 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.endsWith(r);
}
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
return l.endsWithInLowerCase(r);
if (r.numBytes() == 0) return true;
uros-db marked this conversation as resolved.
Show resolved Hide resolved
return CollationAwareUTF8String.lowercaseMatchUntil(l, r.toLowerCase(), l.numChars());
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
Expand Down Expand Up @@ -354,7 +357,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
return string.indexOf(substring, 0);
}
public static int execLowercase(final UTF8String string, final UTF8String substring) {
return string.toLowerCase().indexOf(substring.toLowerCase(), 0);
return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
}
public static int execICU(final UTF8String string, final UTF8String substring,
final int collationId) {
Expand Down Expand Up @@ -430,7 +433,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
}
public static int execLowercase(final UTF8String string, final UTF8String substring,
final int start) {
return string.toLowerCase().indexOf(substring.toLowerCase(), start);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to confirm, the previous implementation here is correct, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, unfortunately it's not - while it works fine for ASCII, it actually gives wrong results in some special cases featuring conditional case mapping, when a character has a lowercase equivalent that consists of multiple characters, or is found at a particular place in the string (context-awareness)

Copy link
Contributor Author

@uros-db uros-db May 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so as part of this PR, we actually changed the core definition of string-searching in UTF8_BINARY_LCASE, i.e. what it means for one substring (pattern) to be found in another string (target) under UTF8_BINARY_LCASE

in the old implementation, contains("İ", "i") would return true - however, this behaviour is incorrect because it relies on the fact that substr(lower("İ"), 1, 1) == "i" (incorrect, old implementation), instead of lower(substr("İ", 1, 1)) != "i" (correct, new implementation)

and this is all due to the fact that lower("İ") = "i\u0307" (1 uppercase character -> 2 lowercase characters)

return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start);
}
public static int execICU(final UTF8String string, final UTF8String substring, final int start,
final int collationId) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -341,44 +341,6 @@ public boolean contains(final UTF8String substring) {
return false;
}

/**
 * Checks whether the lowercase form of `this` contains the lowercase form of `substring`.
 *
 * An empty `substring` is always contained. When both strings are pure ASCII the search
 * runs byte by byte over the existing buffer, so no lowercased copies are allocated.
 */
public boolean containsInLowerCase(final UTF8String substring) {
  final int needleBytes = substring.numBytes;
  if (needleBytes == 0) {
    return true;
  }

  // As soon as either side holds a non-ASCII character, fall back to lowercasing both
  // strings in full. The alternative would be a prefix check at every position, which
  // would allocate on each step (e.g. for `containsInLowerCase("1大1大1大...", "11")`).
  if (!substring.isFullAscii()) {
    return toLowerCase().contains(substring.toLowerCaseSlow());
  }
  if (!isFullAscii()) {
    return toLowerCaseSlow().contains(substring.toLowerCaseAscii());
  }

  if (numBytes < needleBytes) {
    return false;
  }

  // ASCII-only path: use the first byte as a cheap filter before the full comparison.
  final int leadingLower = Character.toLowerCase(substring.getByte(0));
  final int lastStart = numBytes - needleBytes;
  for (int start = 0; start <= lastStart; start++) {
    if (Character.toLowerCase(getByte(start)) != leadingLower) {
      continue;
    }
    final UTF8String tail = UTF8String.fromAddress(base, offset + start, numBytes - start);
    if (tail.matchAtInLowerCaseAscii(substring, 0)) {
      return true;
    }
  }

  return false;
}

/**
* Returns the byte at position `i`.
*/
Expand All @@ -393,94 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) {
return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
}

/**
 * Compares the bytes of `s` with the bytes of `this` starting at byte position `pos`,
 * lowercasing each byte on both sides before comparing.
 *
 * Returns false when `pos` is negative or when `s` would run past the end of `this`.
 */
private boolean matchAtInLowerCaseAscii(final UTF8String s, int pos) {
  if (pos < 0 || s.numBytes + pos > numBytes) {
    return false;
  }

  int idx = 0;
  while (idx < s.numBytes) {
    final int mine = Character.toLowerCase(getByte(pos + idx));
    final int theirs = Character.toLowerCase(s.getByte(idx));
    if (mine != theirs) {
      return false;
    }
    idx++;
  }

  return true;
}

/**
 * Checks whether `prefix` matches `this` at byte position 0, as decided by `matchAt`.
 */
public boolean startsWith(final UTF8String prefix) {
  final int firstByte = 0;
  return matchAt(prefix, firstByte);
}

/**
 * Checks whether the lowercase form of `this` starts with the lowercase form of `prefix`.
 *
 * An empty `prefix` always matches; a non-empty `prefix` never matches an empty `this`.
 * When the prefix and the leading bytes it is compared against are pure ASCII, the check
 * runs directly on the existing bytes and allocates no lowercased copies.
 */
public boolean startsWithInLowerCase(final UTF8String prefix) {
  // Byte lengths cannot be used for an early rejection before the ASCII checks below:
  // lowercasing may expand one grapheme into several independent ones.
  final int prefixBytes = prefix.numBytes;
  if (prefixBytes == 0) {
    return true;
  }
  if (numBytes == 0) {
    return false;
  }

  if (!prefix.isFullAscii()) {
    return toLowerCase().startsWith(prefix.toLowerCaseSlow());
  }

  // Only the leading `prefixBytes` bytes of `this` can take part in an ASCII comparison.
  final UTF8String head;
  if (prefixBytes >= numBytes) {
    head = this;
  } else {
    head = UTF8String.fromAddress(base, offset, prefixBytes);
  }
  if (!head.isFullAscii()) {
    return toLowerCaseSlow().startsWith(prefix.toLowerCaseAscii());
  }

  return numBytes >= prefixBytes && matchAtInLowerCaseAscii(prefix, 0);
}

/**
 * Checks whether `suffix` matches `this` at the byte position where a suffix of that
 * byte length would begin, as decided by `matchAt`.
 */
public boolean endsWith(final UTF8String suffix) {
  final int suffixStart = numBytes - suffix.numBytes;
  return matchAt(suffix, suffixStart);
}

/**
 * Checks whether the lowercase form of `this` ends with the lowercase form of `suffix`.
 *
 * An empty `suffix` always matches; a non-empty `suffix` never matches an empty `this`.
 * When the suffix and the trailing bytes it is compared against are pure ASCII, the check
 * runs directly on the existing bytes and allocates no lowercased copies.
 */
public boolean endsWithInLowerCase(final UTF8String suffix) {
  // Byte lengths cannot be used for an early rejection before the ASCII checks below:
  // lowercasing may expand one grapheme into several independent ones.
  final int suffixBytes = suffix.numBytes;
  if (suffixBytes == 0) {
    return true;
  }
  if (numBytes == 0) {
    return false;
  }

  if (!suffix.isFullAscii()) {
    return toLowerCase().endsWith(suffix.toLowerCaseSlow());
  }

  // Only the trailing `suffixBytes` bytes of `this` can take part in an ASCII comparison.
  final UTF8String tail;
  if (suffixBytes >= numBytes) {
    tail = this;
  } else {
    tail = UTF8String.fromAddress(base, offset + numBytes - suffixBytes, suffixBytes);
  }
  if (!tail.isFullAscii()) {
    return toLowerCaseSlow().endsWith(suffix.toLowerCaseAscii());
  }

  return numBytes >= suffixBytes && matchAtInLowerCaseAscii(suffix, numBytes - suffixBytes);
}

/**
* Returns the upper case of this string
*/
Expand Down