apache · uros-db · May 9, 2024 · May 9, 2024 · May 10, 2024 · May 10, 2024
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -118,7 +118,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.contains(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.containsInLowerCase(r);
+      if (r.numChars() == 0) return true;
+      if (l.numChars() < r.numChars()) return false;
+      return CollationAwareUTF8String.lowercaseIndexOf(l, r, 0) >= 0;
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -156,7 +158,9 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.startsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.startsWithInLowerCase(r);
+      if (r.numBytes() == 0) return true;
+      if (l.numChars() < r.numChars()) return false;
+      return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(), 0, r.numChars());
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -193,7 +197,10 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) {
       return l.endsWith(r);
     }
     public static boolean execLowercase(final UTF8String l, final UTF8String r) {
-      return l.endsWithInLowerCase(r);
+      if (r.numBytes() == 0) return true;
+      if (l.numChars() < r.numChars()) return false;
+      return CollationAwareUTF8String.lowercaseMatchAt(l, r.toLowerCase(),
+        l.numChars() - r.numChars(), r.numChars());
     }
     public static boolean execICU(final UTF8String l, final UTF8String r,
         final int collationId) {
@@ -354,7 +361,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
       return string.indexOf(substring, 0);
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring) {
-      return string.toLowerCase().indexOf(substring.toLowerCase(), 0);
+      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
     }
     public static int execICU(final UTF8String string, final UTF8String substring,
         final int collationId) {
@@ -430,7 +437,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
     }
     public static int execLowercase(final UTF8String string, final UTF8String substring,
         final int start) {
-      return string.toLowerCase().indexOf(substring.toLowerCase(), start);
+      return CollationAwareUTF8String.lowercaseIndexOf(string, substring, start);
     }
     public static int execICU(final UTF8String string, final UTF8String substring, final int start,
         final int collationId) {

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -341,44 +341,6 @@ public boolean contains(final UTF8String substring) {
     return false;
   }
 
-  /**
-   * Returns whether `this` contains `substring` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean containsInLowerCase(final UTF8String substring) {
-    if (substring.numBytes == 0) {
-      return true;
-    }
-
-    // Both `this` and the `substring` are checked for non-ASCII characters, otherwise we would
-    // have to use `startsWithLowerCase(...)` in a loop, and it would frequently allocate
-    // (e.g. in case of `containsInLowerCase("1大1大1大...", "11")`)
-    if (!substring.isFullAscii()) {
-      return toLowerCase().contains(substring.toLowerCaseSlow());
-    }
-    if (!isFullAscii()) {
-      return toLowerCaseSlow().contains(substring.toLowerCaseAscii());
-    }
-
-    if (numBytes < substring.numBytes) {
-      return false;
-    }
-
-    final var firstLower = Character.toLowerCase(substring.getByte(0));
-    for (var i = 0; i <= (numBytes - substring.numBytes); i++) {
-      if (Character.toLowerCase(getByte(i)) == firstLower) {
-        final var rest = UTF8String.fromAddress(base, offset + i, numBytes - i);
-        if (rest.matchAtInLowerCaseAscii(substring, 0)) {
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
   /**
    * Returns the byte at position `i`.
    */
@@ -393,94 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) {
     return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
   }
 
-  private boolean matchAtInLowerCaseAscii(final UTF8String s, int pos) {
-    if (s.numBytes + pos > numBytes || pos < 0) {
-      return false;
-    }
-
-    for (var i = 0; i < s.numBytes; i++) {
-      if (Character.toLowerCase(getByte(pos + i)) != Character.toLowerCase(s.getByte(i))) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
   public boolean startsWith(final UTF8String prefix) {
     return matchAt(prefix, 0);
   }
 
-  /**
-   * Checks whether `prefix` is a prefix of `this` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean startsWithInLowerCase(final UTF8String prefix) {
-    // No way to match sizes of strings for early return, since single grapheme can be expanded
-    // into several independent ones in lowercase
-    if (prefix.numBytes == 0) {
-      return true;
-    }
-    if (numBytes == 0) {
-      return false;
-    }
-
-    if (!prefix.isFullAscii()) {
-      return toLowerCase().startsWith(prefix.toLowerCaseSlow());
-    }
-
-    final var part = prefix.numBytes >= numBytes ? this : UTF8String.fromAddress(
-      base, offset, prefix.numBytes);
-    if (!part.isFullAscii()) {
-      return toLowerCaseSlow().startsWith(prefix.toLowerCaseAscii());
-    }
-
-    if (numBytes < prefix.numBytes) {
-      return false;
-    }
-
-    return matchAtInLowerCaseAscii(prefix, 0);
-  }
-
   public boolean endsWith(final UTF8String suffix) {
     return matchAt(suffix, numBytes - suffix.numBytes);
   }
 
-  /**
-   * Checks whether `suffix` is a suffix of `this` in a lowercase unicode-aware manner
-   *
-   * This function is written in a way which avoids excessive allocations in case if we work with
-   * bare ASCII-character strings.
-   */
-  public boolean endsWithInLowerCase(final UTF8String suffix) {
-    // No way to match sizes of strings for early return, since single grapheme can be expanded
-    // into several independent ones in lowercase
-    if (suffix.numBytes == 0) {
-      return true;
-    }
-    if (numBytes == 0) {
-      return false;
-    }
-
-    if (!suffix.isFullAscii()) {
-      return toLowerCase().endsWith(suffix.toLowerCaseSlow());
-    }
-
-    final var part = suffix.numBytes >= numBytes ? this : UTF8String.fromAddress(
-      base, offset + numBytes - suffix.numBytes, suffix.numBytes);
-    if (!part.isFullAscii()) {
-      return toLowerCaseSlow().endsWith(suffix.toLowerCaseAscii());
-    }
-
-    if (numBytes < suffix.numBytes) {
-      return false;
-    }
-
-    return matchAtInLowerCaseAscii(suffix, numBytes - suffix.numBytes);
-  }
-
   /**
    * Returns the upper case of this string
    */

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -102,20 +102,28 @@ public void testContains() throws SparkException {
     assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
     // Case-variable character length
-    assertContains("abİo12", "i̇o", "UNICODE_CI", true);
-    assertContains("abi̇o12", "İo", "UNICODE_CI", true);
-    assertContains("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true);
-    assertContains("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true);
-    assertContains("The İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", true);
-    assertContains("İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true);
-    assertContains("İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", false);
-    // Characters with the same binary lowercase representation
-    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false);
+    assertContains("i̇", "i", "UNICODE_CI", false);
+    assertContains("i̇", "İ", "UNICODE_CI", true);
+    assertContains("İ", "i", "UNICODE_CI", false);
+    assertContains("adi̇os", "io", "UNICODE_CI", false);
+    assertContains("adi̇os", "Io", "UNICODE_CI", false);
+    assertContains("adi̇os", "i̇o", "UNICODE_CI", true);
+    assertContains("adi̇os", "İo", "UNICODE_CI", true);
+    assertContains("adİos", "io", "UNICODE_CI", false);
+    assertContains("adİos", "Io", "UNICODE_CI", false);
+    assertContains("adİos", "i̇o", "UNICODE_CI", true);
+    assertContains("adİos", "İo", "UNICODE_CI", true);
+    assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI
+    assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true);
+    assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("adİos", "io", "UTF8_BINARY_LCASE", false);
+    assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false);
+    assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertContains("adİos", "İo", "UTF8_BINARY_LCASE", true);
   }
 
   private void assertStartsWith(
@@ -191,20 +199,36 @@ public void testStartsWith() throws SparkException {
     assertStartsWith("ab世De", "AB世dE", "UNICODE_CI", true);
     assertStartsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertStartsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
-    // Case-variable character length
-    assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
-    assertStartsWith("i̇onic", "İo", "UNICODE_CI", true);
-    assertStartsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", true);
-    assertStartsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false);
     // Characters with the same binary lowercase representation
     assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertStartsWith("KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false);
+    // Case-variable character length
+    assertStartsWith("i̇", "i", "UNICODE_CI", false);
+    assertStartsWith("i̇", "İ", "UNICODE_CI", true);
+    assertStartsWith("İ", "i", "UNICODE_CI", false);
+    assertStartsWith("i̇onic", "io", "UNICODE_CI", false);
+    assertStartsWith("i̇onic", "Io", "UNICODE_CI", false);
+    assertStartsWith("i̇onic", "i̇o", "UNICODE_CI", true);
+    assertStartsWith("i̇onic", "İo", "UNICODE_CI", true);
+    assertStartsWith("İonic", "io", "UNICODE_CI", false);
+    assertStartsWith("İonic", "Io", "UNICODE_CI", false);
+    assertStartsWith("İonic", "i̇o", "UNICODE_CI", true);
+    assertStartsWith("İonic", "İo", "UNICODE_CI", true);
+    assertStartsWith("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI
+    assertStartsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("İ", "i", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("i̇onic", "io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("i̇onic", "Io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("i̇onic", "i̇o", "UTF8_BINARY_LCASE", true);
+    assertStartsWith("i̇onic", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("İonic", "io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("İonic", "Io", "UTF8_BINARY_LCASE", false);
+    assertStartsWith("İonic", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertStartsWith("İonic", "İo", "UTF8_BINARY_LCASE", true);
   }
 
   private void assertEndsWith(String pattern, String suffix, String collationName, boolean expected)
@@ -279,20 +303,36 @@ public void testEndsWith() throws SparkException {
     assertEndsWith("ab世De", "AB世dE", "UNICODE_CI", true);
     assertEndsWith("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertEndsWith("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
-    // Case-variable character length
-    assertEndsWith("The İo", "i̇o", "UNICODE_CI", true);
-    assertEndsWith("The i̇o", "İo", "UNICODE_CI", true);
-    assertEndsWith("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("The İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true);
-    assertEndsWith("The İodiNe", "i̇oDin", "UTF8_BINARY_LCASE", false);
     // Characters with the same binary lowercase representation
     assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The Kelvin", "Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The KKelvin", "KKelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_BINARY_LCASE", true);
     assertEndsWith("The KKelvin", "KKelvin,", "UTF8_BINARY_LCASE", false);
+    // Case-variable character length
+    assertEndsWith("i̇", "i", "UNICODE_CI", false);
+    assertEndsWith("i̇", "İ", "UNICODE_CI", true);
+    assertEndsWith("İ", "i", "UNICODE_CI", false);
+    assertEndsWith("the i̇o", "io", "UNICODE_CI", false);
+    assertEndsWith("the i̇o", "Io", "UNICODE_CI", false);
+    assertEndsWith("the i̇o", "i̇o", "UNICODE_CI", true);
+    assertEndsWith("the i̇o", "İo", "UNICODE_CI", true);
+    assertEndsWith("the İo", "io", "UNICODE_CI", false);
+    assertEndsWith("the İo", "Io", "UNICODE_CI", false);
+    assertEndsWith("the İo", "i̇o", "UNICODE_CI", true);
+    assertEndsWith("the İo", "İo", "UNICODE_CI", true);
+    assertEndsWith("i̇", "i", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("İ", "i", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the i̇o", "io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the i̇o", "Io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the i̇o", "i̇o", "UTF8_BINARY_LCASE", true);
+    assertEndsWith("the i̇o", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("the İo", "io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the İo", "Io", "UTF8_BINARY_LCASE", false);
+    assertEndsWith("the İo", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI
+    assertEndsWith("the İo", "İo", "UTF8_BINARY_LCASE", true);
   }
 
   private void assertStringSplitSQL(String str, String delimiter, String collationName,
@@ -709,6 +749,16 @@ public void testLocate() throws SparkException {
     assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9);
     assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1);
     // Case-variable character length
+    assertLocate("̇", "i̇", 1, "UTF8_BINARY", 2);
+    assertLocate("̇", "İ", 1, "UTF8_BINARY_LCASE", 0); // note: different from UTF8_BINARY
+    assertLocate("i", "i̇", 1, "UNICODE_CI", 0);
+    assertLocate("i̇", "i", 1, "UNICODE_CI", 0);
+    assertLocate("İ", "i̇", 1, "UNICODE_CI", 1);
+    assertLocate("İ", "i", 1, "UNICODE_CI", 0);
+    assertLocate("i", "i̇", 1, "UTF8_BINARY_LCASE", 1); // note: different from UNICODE_CI
+    assertLocate("i̇", "i", 1, "UTF8_BINARY_LCASE", 0);
+    assertLocate("İ", "i̇", 1, "UTF8_BINARY_LCASE", 0); // note: different from UNICODE_CI
+    assertLocate("İ", "i", 1, "UTF8_BINARY_LCASE", 0);
     assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1);
     assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3);
     assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11);