Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(starts_with, {}),
BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(ends_with, {}),

BINARY_UNSAFE_NULL_IF_NULL(locate, {"position"}, utf8, int32),

UNARY_OCTET_LEN_FN(octet_length, {}),
UNARY_OCTET_LEN_FN(bit_length, {}),

Expand Down Expand Up @@ -95,6 +97,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
NativeFunction("convert_fromUTF8", {"convert_fromutf8"}, DataTypeVector{binary()},
utf8(), kResultNullIfNull, "convert_fromUTF8_binary",
NativeFunction::kNeedsContext),

NativeFunction("locate", {"position"}, DataTypeVector{utf8(), utf8(), int32()},
int32(), kResultNullIfNull, "locate_utf8_utf8_int32",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
};

return string_fn_registry_;
Expand Down
55 changes: 55 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,29 @@ int32 utf8_length(int64 context, const char* data, int32 data_len) {
return count;
}

// Translate a 0-based character offset into the matching byte offset of a
// non-empty utf8 buffer.
//
// Walks the buffer one glyph at a time, using utf8_char_length() on each lead byte.
// Returns the byte offset on success. Returns -1 after recording an error on the
// context when
//   - a lead byte is invalid or its glyph would run past str_len, or
//   - the requested character lies at or beyond the end of the buffer.
FORCE_INLINE
int32 utf8_byte_pos(int64 context, const char* str, int32 str_len, int32 char_pos) {
  int offset = 0;
  int32 chars_left = char_pos;

  while (chars_left > 0 && offset < str_len) {
    const int glyph_len = utf8_char_length(str[offset]);
    const bool bad_lead_byte = (glyph_len == 0);
    const bool truncated_glyph = (offset + glyph_len > str_len);
    if (bad_lead_byte || truncated_glyph) {
      set_error_for_invalid_utf(context, str[offset]);
      return -1;
    }
    offset += glyph_len;
    --chars_left;
  }

  // The offset must address an existing byte; landing exactly on the end of the
  // buffer is rejected as well.
  if (offset < str_len) {
    return offset;
  }
  gdv_fn_context_set_error_msg(context, "Invalid character position argument");
  return -1;
}

#define UTF8_LENGTH(NAME, TYPE) \
FORCE_INLINE \
int32 NAME##_##TYPE(int64 context, TYPE in, int32 in_len) { \
Expand Down Expand Up @@ -410,4 +433,36 @@ const char* convert_fromUTF8_binary(int64 context, const char* bin_in, int32 len
return ret;
}

// Locate the first occurrence of sub_str inside str, scanning from the very first
// character. Delegates to locate_utf8_utf8_int32 with a start position of 1, so the
// result follows that function's contract.
FORCE_INLINE
int32 locate_utf8_utf8(int64 context, const char* sub_str, int32 sub_str_len,
                       const char* str, int32 str_len) {
  const int32 first_char_pos = 1;  // character positions are 1-indexed
  return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len,
                                first_char_pos);
}

// Locate sub_str inside str, beginning the scan at character start_pos (1-indexed).
//
// Returns the 1-indexed character position of the first match, or 0 when
//   - start_pos is smaller than 1 (an error is recorded on the context),
//   - either input is empty,
//   - start_pos cannot be mapped to a byte offset (utf8_byte_pos records the error),
//   - no match exists at or after start_pos.
FORCE_INLINE
int32 locate_utf8_utf8_int32(int64 context, const char* sub_str, int32 sub_str_len,
                             const char* str, int32 str_len, int32 start_pos) {
  if (start_pos < 1) {
    gdv_fn_context_set_error_msg(context, "Start position must be greater than 0");
    return 0;
  }

  const bool nothing_to_search = (str_len == 0) || (sub_str_len == 0);
  if (nothing_to_search) {
    return 0;
  }

  // Convert the character-based start position into a byte offset.
  const int32 first_candidate = utf8_byte_pos(context, str, str_len, start_pos - 1);
  if (first_candidate < 0) {
    return 0;
  }

  // Byte-wise scan; the last offset at which sub_str still fits is str_len - sub_str_len.
  const int32 last_candidate = str_len - sub_str_len;
  int32 candidate = first_candidate;
  while (candidate <= last_candidate) {
    if (memcmp(str + candidate, sub_str, sub_str_len) == 0) {
      // Report the hit as a character position: count the glyphs before it, plus one.
      return utf8_length(context, str, candidate) + 1;
    }
    ++candidate;
  }
  return 0;
}

} // extern "C"
58 changes: 58 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -309,4 +309,62 @@ TEST(TestStringOps, TestReverse) {
ctx.Reset();
}

// Exercises locate_utf8_utf8 / locate_utf8_utf8_int32: ASCII and multi-byte matches,
// empty inputs, and the three error paths (bad start position, start position past
// the end of the string, invalid utf8 byte).
TEST(TestStringOps, TestLocate) {
  gandiva::ExecutionContext ctx;
  // The precompiled functions take the context as an int64, so hold it as one
  // instead of round-tripping through an unsigned type.
  int64 ctx_ptr = reinterpret_cast<int64>(&ctx);

  int pos;

  // Default start position.
  pos = locate_utf8_utf8(ctx_ptr, "String", 6, "TestString", 10);
  EXPECT_EQ(pos, 5);
  EXPECT_FALSE(ctx.has_error());

  // Explicit start position 1 behaves like the two-argument form.
  pos = locate_utf8_utf8_int32(ctx_ptr, "String", 6, "TestString", 10, 1);
  EXPECT_EQ(pos, 5);
  EXPECT_FALSE(ctx.has_error());

  // Starting past the first occurrence finds the second one.
  pos = locate_utf8_utf8_int32(ctx_ptr, "abc", 3, "abcabc", 6, 2);
  EXPECT_EQ(pos, 4);
  EXPECT_FALSE(ctx.has_error());

  // Multi-byte glyphs: the result is a character position, not a byte offset.
  pos = locate_utf8_utf8(ctx_ptr, "çåå", 6, "s†å†emçåå†d", 21);
  EXPECT_EQ(pos, 7);
  EXPECT_FALSE(ctx.has_error());

  // The start position is counted in characters as well.
  pos = locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "†barbar", 9, 3);
  EXPECT_EQ(pos, 5);
  EXPECT_FALSE(ctx.has_error());

  // Empty string to search in: no match, no error.
  pos = locate_utf8_utf8_int32(ctx_ptr, "sub", 3, "", 0, 1);
  EXPECT_EQ(pos, 0);
  EXPECT_FALSE(ctx.has_error());

  // Empty substring: no match, no error.
  pos = locate_utf8_utf8_int32(ctx_ptr, "", 0, "str", 3, 1);
  EXPECT_EQ(pos, 0);
  EXPECT_FALSE(ctx.has_error());

  // Start position below 1 is rejected.
  pos = locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "barbar", 6, 0);
  EXPECT_EQ(pos, 0);
  EXPECT_THAT(ctx.get_error(),
              ::testing::HasSubstr("Start position must be greater than 0"));
  ctx.Reset();

  // Start position beyond the last character is rejected.
  pos = locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "barbar", 6, 7);
  EXPECT_EQ(pos, 0);
  EXPECT_THAT(ctx.get_error(),
              ::testing::HasSubstr("Invalid character position argument"));
  ctx.Reset();

  // Invalid utf8 byte before the start position is reported.
  std::string d(
      "a\xff"
      "c");
  pos =
      locate_utf8_utf8_int32(ctx_ptr, "c", 1, d.data(), static_cast<int>(d.length()), 3);
  EXPECT_EQ(pos, 0);
  EXPECT_THAT(ctx.get_error(),
              ::testing::HasSubstr(
                  "unexpected byte \\ff encountered while decoding utf8 string"));
  ctx.Reset();
}

} // namespace gandiva
6 changes: 6 additions & 0 deletions cpp/src/gandiva/precompiled/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,12 @@ const char* lower_utf8(int64 context, const char* data, int32 data_len,

const char* reverse_utf8(int64 context, const char* data, int32 data_len,
int32_t* out_len);

// Returns the 1-indexed character position of the first occurrence of sub_str in
// str, searching from the first character; 0 when there is no match or either
// input is empty.
int32 locate_utf8_utf8(int64 context, const char* sub_str, int32 sub_str_len,
const char* str, int32 str_len);

// Same as locate_utf8_utf8, but the search begins at character start_pos
// (1-indexed). Returns 0 and records an error on the context when start_pos is
// smaller than 1 or cannot be resolved to a position inside str.
int32 locate_utf8_utf8_int32(int64 context, const char* sub_str, int32 sub_str_len,
const char* str, int32 str_len, int32 start_pos);
} // extern "C"

#endif // PRECOMPILED_TYPES_H