Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-15699: [C++][Gandiva] Fix implementation of left and right func… #12440

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
65 changes: 34 additions & 31 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2163,31 +2163,37 @@ const char* left_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_
return "";
}

int32_t char_count = utf8_length(context, text, text_len);

// char_count is zero if input has invalid utf8 char
if (char_count == 0) {
*out_len = 0;
return "";
}

// case where left('abcdef', -6) -> "" and left('abcdef', -7) -> ""
if (number < 0 && -(number) >= char_count) {
*out_len = 0;
return "";
}

// iterate over the utf8 string validating each character
int char_len;
int char_count = 0;
int current_char_count = 0;
int byte_index = 0;
for (int i = 0; i < text_len; i += char_len) {
char_len = utf8_char_length(text[i]);
if (char_len == 0 || i + char_len > text_len) { // invalid byte or incomplete glyph
set_error_for_invalid_utf(context, text[i]);
*out_len = 0;
return "";
}
for (int j = 1; j < char_len; ++j) {
if ((text[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
set_error_for_invalid_utf(context, text[i + j]);
*out_len = 0;
return "";
}
}
byte_index += char_len;
++char_count;
++current_char_count;
// Define the rules to stop the iteration over the string
// case where left('abc', 5) -> 'abc'
if (number > 0 && char_count == number) break;
if (number > 0 && current_char_count == number) {
break;
}
// case where left('abc', -5) ==> ''
if (number < 0 && char_count == number + text_len) break;
if (number < 0 && current_char_count == number + char_count) {
break;
}
}

*out_len = byte_index;
Expand All @@ -2209,37 +2215,34 @@ const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text

// initially counts the number of utf8 characters in the defined text
int32_t char_count = utf8_length(context, text, text_len);

// char_count is zero if input has invalid utf8 char
if (char_count == 0) {
*out_len = 0;
return "";
}

// case where right('abcdef', -6) -> "" and right('abcdef', -7) -> ""
if (number < 0 && -(number) >= char_count) {
*out_len = 0;
return "";
}

int32_t start_char_pos; // the char result start position (inclusive)
int32_t end_char_len; // the char result end position (inclusive)

if (number > 0) {
// case where right('abc', 5) ==> 'abc' start_char_pos=1.
start_char_pos = (char_count > number) ? char_count - number : 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this logic is not correct for multibyte unicode characters.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nvm. looks like it is being handled in following lines

end_char_len = char_count - start_char_pos;
} else {
start_char_pos = number * -1;
end_char_len = char_count - start_char_pos;
}

// calculate the start byte position and the output length
// calculate the start byte position
int32_t start_byte_pos = utf8_byte_pos(context, text, text_len, start_char_pos);
*out_len = utf8_byte_pos(context, text, text_len, end_char_len);

// try to allocate memory for the response
char* ret =
reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, *out_len));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
*out_len = 0;
return "";
}
memcpy(ret, text + start_byte_pos, *out_len);
return ret;
// calculate output length
*out_len = (text_len - start_byte_pos);
return text + start_byte_pos;
}

FORCE_INLINE
Expand Down
24 changes: 24 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1734,10 +1734,22 @@ TEST(TestStringOps, TestLeftString) {
output = std::string(out_str, out_len);
EXPECT_EQ(output, "TestStr");

out_str = left_utf8_int32(ctx_ptr, "TestString", 10, -10, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "");

out_str = left_utf8_int32(ctx_ptr, "TestString", 10, -11, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "");

// the text length for this string is 10 (each utf8 char is represented by two bytes)
out_str = left_utf8_int32(ctx_ptr, "абвгд", 10, 3, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "абв");

out_str = left_utf8_int32(ctx_ptr, "¥¥abdc", 8, -6, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "");
}

TEST(TestStringOps, TestRightString) {
Expand Down Expand Up @@ -1766,10 +1778,22 @@ TEST(TestStringOps, TestRightString) {
output = std::string(out_str, out_len);
EXPECT_EQ(output, "tString");

out_str = right_utf8_int32(ctx_ptr, "TestString", 10, -10, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "");

out_str = right_utf8_int32(ctx_ptr, "TestString", 10, -11, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "");

// the text length for this string is 10 (each utf8 char is represented by two bytes)
out_str = right_utf8_int32(ctx_ptr, "абвгд", 10, 3, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "вгд");

out_str = right_utf8_int32(ctx_ptr, "¥¥abdc", 8, -6, &out_len);
output = std::string(out_str, out_len);
EXPECT_EQ(output, "");
}

TEST(TestStringOps, TestBinaryString) {
Expand Down