apache · uros-db · May 21, 2024 · May 21, 2024 · May 21, 2024 · May 24, 2024
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -34,6 +34,155 @@
  * Utility class for collation-aware UTF8String operations.
  */
 public class CollationAwareUTF8String {
+
+  /**
+   * Sentinel value returned by the lowercase match and find helpers in this class when the
+   * pattern string does not occur in the target string.
+   */
+  private static final int MATCH_NOT_FOUND = -1;
+
+  /**
+   * Checks whether some substring of the target string that begins at the given position
+   * (0-based character index into the UTF8String) is equal to the given pattern under the
+   * UTF8_BINARY_LCASE collation, i.e. after lowercasing that substring. The pattern is expected
+   * to be lowercased by the caller, so that it is not lowercased again on every call.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param startPos the character position in the target string at which the match must start
+   * @return true if a match starting at `startPos` exists, false otherwise
+   */
+  public static boolean lowercaseMatchFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    final int matchLength = lowercaseMatchLengthFrom(target, lowercasePattern, startPos);
+    return matchLength != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length of the shortest substring of the target string that starts at the
+   * specified position (0-based index referring to character position in UTF8String) and
+   * matches the specified prefix with respect to the UTF8_BINARY_LCASE collation. The method
+   * assumes that the prefix is already lowercased. Only the part of the target string starting
+   * at `startPos` (inclusive) is considered, that is, characters before `startPos` are never
+   * looked at. If the prefix is not found, MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return length of the target substring that starts with the specified prefix in lowercase
+   */
+  public static int lowercaseMatchLengthFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    assert startPos >= 0;
+    // target.numChars() is evaluated once in the loop initializer, not on every iteration.
+    for (int len = 0, maxLen = target.numChars() - startPos; len <= maxLen; ++len) {
+      if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
+        return len;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * searching from the specified position (0-based index referring to character position in
+   * UTF8String) onwards, with respect to the UTF8_BINARY_LCASE collation. The pattern string is
+   * assumed to be already lowercased. If the pattern is not found, MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return the start position of the first occurrence of pattern in target
+   */
+  public static int lowercaseFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    assert startPos >= 0;
+    // target.numChars() is evaluated once in the loop initializer, not on every iteration.
+    for (int i = startPos, numChars = target.numChars(); i <= numChars; ++i) {
+      if (lowercaseMatchFrom(target, lowercasePattern, i)) {
+        return i;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Checks whether some substring of the target string that finishes right before the given
+   * position (0-based character index into the UTF8String) is equal to the given pattern under
+   * the UTF8_BINARY_LCASE collation, i.e. after lowercasing that substring. The pattern is
+   * expected to be lowercased by the caller, so that it is not lowercased again on every call.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param endPos the (exclusive) character position in the target string where the match ends
+   * @return true if a match ending at `endPos` exists, false otherwise
+   */
+  public static boolean lowercaseMatchUntil(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    final int matchLength = lowercaseMatchLengthUntil(target, lowercasePattern, endPos);
+    return matchLength != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Computes the length of the shortest substring of the target string that finishes right
+   * before the given position (0-based character index into the UTF8String) and is equal to
+   * the given suffix under the UTF8_BINARY_LCASE collation, i.e. after lowercasing that
+   * substring. The suffix is expected to be lowercased by the caller. Characters of the target
+   * string at or after position `endPos` are never looked at. MATCH_NOT_FOUND is returned when
+   * no such substring exists.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param endPos the (exclusive) character position in the target string where the match ends
+   * @return length of the shortest matching substring ending at `endPos`, or MATCH_NOT_FOUND
+   */
+  public static int lowercaseMatchLengthUntil(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    assert endPos <= target.numChars();
+    // Move the candidate start position leftwards from `endPos`, shortest candidate first.
+    for (int from = endPos; from >= 0; --from) {
+      if (target.substring(from, endPos).toLowerCase().equals(lowercasePattern)) {
+        return endPos - from;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Searches the target string backwards for the last occurrence of the pattern string that
+   * finishes at or before the given position (0-based character index into the UTF8String),
+   * under the UTF8_BINARY_LCASE collation. The pattern string is expected to be lowercased by
+   * the caller. Note that the value returned is the (exclusive) position at which the match
+   * ends, not the position at which it starts. MATCH_NOT_FOUND is returned when there is none.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the (already lowercased) string to be searched for
+   * @param endPos the (exclusive) position in the target string that a match may not go beyond
+   * @return the (exclusive) end position of the last occurrence of pattern in target
+   */
+  public static int lowercaseRFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    assert endPos <= target.numChars();
+    int candidateEnd = endPos;
+    while (candidateEnd >= 0 && !lowercaseMatchUntil(target, lowercasePattern, candidateEnd)) {
+      --candidateEnd;
+    }
+    // The loop stops on the matching end position, or below zero once all candidates failed.
+    return candidateEnd >= 0 ? candidateEnd : MATCH_NOT_FOUND;
+  }
+
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -94,44 +243,28 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
     if (src.numBytes() == 0 || search.numBytes() == 0) {
       return src;
     }
-    UTF8String lowercaseString = src.toLowerCase();
+
     UTF8String lowercaseSearch = search.toLowerCase();
 
     int start = 0;
-    int end = lowercaseString.indexOf(lowercaseSearch, 0);
+    int end = lowercaseFind(src, lowercaseSearch, start);
     if (end == -1) {
       // Search string was not found, so string is unchanged.
       return src;
     }
 
-    // Initialize byte positions
-    int c = 0;
-    int byteStart = 0; // position in byte
-    int byteEnd = 0; // position in byte
-    while (byteEnd < src.numBytes() && c < end) {
-      byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
-      c += 1;
-    }
-
     // At least one match was found. Estimate space needed for result.
     // The 16x multiplier here is chosen to match commons-lang3's implementation.
     int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
     final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
     while (end != -1) {
-      buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
+      buf.append(src.substring(start, end));
       buf.append(replace);
       // Update character positions
-      start = end + lowercaseSearch.numChars();
-      end = lowercaseString.indexOf(lowercaseSearch, start);
-      // Update byte positions
-      byteStart = byteEnd + search.numBytes();
-      while (byteEnd < src.numBytes() && c < end) {
-        byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
-        c += 1;
-      }
+      start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
+      end = lowercaseFind(src, lowercaseSearch, start);
     }
-    buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
-      src.numBytes() - byteStart);
+    buf.append(src.substring(start, src.numChars()));
     return buf.build();
   }
 

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -23,7 +23,7 @@
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
   /**
@@ -610,8 +610,42 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4);
     assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5);
     assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5);
+    assertFindInSet("i̇", "İ", "UNICODE_CI", 1);
+    assertFindInSet("i", "İ", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "i̇", "UNICODE_CI", 1);
+    assertFindInSet("i", "i̇", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "İ,", "UNICODE_CI", 1);
+    assertFindInSet("i", "İ,", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "i̇,", "UNICODE_CI", 1);
+    assertFindInSet("i", "i̇,", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,İ", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,İ", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,i̇", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,i̇", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,İ,12", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,İ,12", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,i̇,12", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,i̇,12", "UNICODE_CI", 0);
     assertFindInSet("i̇o", "ab,İo,12", "UNICODE_CI", 2);
     assertFindInSet("İo", "ab,i̇o,12", "UNICODE_CI", 2);
+    assertFindInSet("i̇", "İ", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "İ", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "i̇", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "i̇", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "İ,", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "İ,", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "i̇,", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "i̇,", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,İ", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,İ", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,i̇", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,i̇", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,İ,12", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,İ,12", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,i̇,12", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2);
   }
 
   private void assertReplace(String source, String search, String replace, String collationName,
@@ -648,8 +682,22 @@ public void testReplace() throws SparkException {
     assertReplace("replace", "", "123", "UNICODE_CI", "replace");
     assertReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c");
     assertReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad");
+    assertReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12");
+    assertReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12");
+    assertReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12");
+    assertReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12");
+    assertReplace("İi̇İi̇İi̇", "i̇", "x", "UNICODE_CI", "xxxxxx");
+    assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
     assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx");
     assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
+    assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abẊ12"); // != UNICODE_CI
+    assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI
+    assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12");
+    assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12");
+    assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx");
+    assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", "İẋİẋİẋ"); // != UNICODE_CI
+    assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx");
+    assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy");
   }
 
   private void assertLocate(String substring, String string, Integer start, String collationName,
@@ -1008,3 +1056,4 @@ public void testStringTrim() throws SparkException {
   // TODO: Test other collation-aware expressions.
 
 }
+// checkstyle.on: AvoidEscapedUnicodeCharacters