apache · nivia007 · Mar 17, 2022 · projjal · Mar 16, 2022 · projjal
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -2163,31 +2163,37 @@ const char* left_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_
     return "";
   }
 
+  int32_t char_count = utf8_length(context, text, text_len);
+
+  // char_count is zero if input has invalid utf8 char
+  if (char_count == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // case where left('abcdef', -6) -> "" and left('abcdef', -7) -> ""
+  if (number < 0 && -(number) >= char_count) {
+    *out_len = 0;
+    return "";
+  }
+
   // iterate over the utf8 string validating each character
   int char_len;
-  int char_count = 0;
+  int current_char_count = 0;
   int byte_index = 0;
   for (int i = 0; i < text_len; i += char_len) {
     char_len = utf8_char_length(text[i]);
-    if (char_len == 0 || i + char_len > text_len) {  // invalid byte or incomplete glyph
-      set_error_for_invalid_utf(context, text[i]);
-      *out_len = 0;
-      return "";
-    }
-    for (int j = 1; j < char_len; ++j) {
-      if ((text[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
-        set_error_for_invalid_utf(context, text[i + j]);
-        *out_len = 0;
-        return "";
-      }
-    }
     byte_index += char_len;
-    ++char_count;
+    ++current_char_count;
     // Define the rules to stop the iteration over the string
     // case where left('abc', 5) -> 'abc'
-    if (number > 0 && char_count == number) break;
+    if (number > 0 && current_char_count == number) {
+      break;
+    }
     // case where left('abc', -5) ==> ''
-    if (number < 0 && char_count == number + text_len) break;
+    if (number < 0 && current_char_count == number + char_count) {
+      break;
+    }
   }
 
   *out_len = byte_index;
@@ -2209,37 +2215,34 @@ const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text
 
   // initially counts the number of utf8 characters in the defined text
   int32_t char_count = utf8_length(context, text, text_len);
+
   // char_count is zero if input has invalid utf8 char
   if (char_count == 0) {
     *out_len = 0;
     return "";
   }
 
+  // case where right('abcdef', -6) -> "" and right('abcdef', -7) -> ""
+  if (number < 0 && -(number) >= char_count) {
+    *out_len = 0;
+    return "";
+  }
+
   int32_t start_char_pos;  // the char result start position (inclusive)
-  int32_t end_char_len;    // the char result end position (inclusive)
+
   if (number > 0) {
     // case where right('abc', 5) ==> 'abc' start_char_pos=1.
     start_char_pos = (char_count > number) ? char_count - number : 0;
-    end_char_len = char_count - start_char_pos;
   } else {
     start_char_pos = number * -1;
-    end_char_len = char_count - start_char_pos;
   }
 
-  // calculate the start byte position and the output length
+  // calculate the start byte position
   int32_t start_byte_pos = utf8_byte_pos(context, text, text_len, start_char_pos);
-  *out_len = utf8_byte_pos(context, text, text_len, end_char_len);
 
-  // try to allocate memory for the response
-  char* ret =
-      reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, *out_len));
-  if (ret == nullptr) {
-    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
-    *out_len = 0;
-    return "";
-  }
-  memcpy(ret, text + start_byte_pos, *out_len);
-  return ret;
+  // calculate output length
+  *out_len = (text_len - start_byte_pos);
+  return text + start_byte_pos;
 }
 
 FORCE_INLINE

diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -1734,10 +1734,22 @@ TEST(TestStringOps, TestLeftString) {
   output = std::string(out_str, out_len);
   EXPECT_EQ(output, "TestStr");
 
+  out_str = left_utf8_int32(ctx_ptr, "TestString", 10, -10, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+
+  out_str = left_utf8_int32(ctx_ptr, "TestString", 10, -11, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+
   // the text length for this string is 10 (each utf8 char is represented by two bytes)
   out_str = left_utf8_int32(ctx_ptr, "абвгд", 10, 3, &out_len);
   output = std::string(out_str, out_len);
   EXPECT_EQ(output, "абв");
+
+  out_str = left_utf8_int32(ctx_ptr, "¥¥abdc", 8, -6, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
 }
 
 TEST(TestStringOps, TestRightString) {
@@ -1766,10 +1778,22 @@ TEST(TestStringOps, TestRightString) {
   output = std::string(out_str, out_len);
   EXPECT_EQ(output, "tString");
 
+  out_str = right_utf8_int32(ctx_ptr, "TestString", 10, -10, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+
+  out_str = right_utf8_int32(ctx_ptr, "TestString", 10, -11, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
+
   // the text length for this string is 10 (each utf8 char is represented by two bytes)
   out_str = right_utf8_int32(ctx_ptr, "абвгд", 10, 3, &out_len);
   output = std::string(out_str, out_len);
   EXPECT_EQ(output, "вгд");
+
+  out_str = right_utf8_int32(ctx_ptr, "¥¥abdc", 8, -6, &out_len);
+  output = std::string(out_str, out_len);
+  EXPECT_EQ(output, "");
 }
 
 TEST(TestStringOps, TestBinaryString) {