From baf20948d0057b1390366e6e08f4c05fe8aaafc2 Mon Sep 17 00:00:00 2001 From: Sagnik Chakraborty Date: Thu, 23 Jul 2020 17:57:28 +0530 Subject: [PATCH] ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string Closes #7641 from sagnikc-dremio/master and squashes the following commits: 4a9985fc5 ARROW-9328: Add LTRIM, RTRIM, BTRIM functions for string Authored-by: Sagnik Chakraborty Signed-off-by: Praveen --- cpp/src/gandiva/function_registry_string.cc | 13 +- cpp/src/gandiva/precompiled/string_ops.cc | 189 ++++++++++++- .../gandiva/precompiled/string_ops_test.cc | 258 +++++++++++++++++- cpp/src/gandiva/precompiled/types.h | 24 +- 4 files changed, 461 insertions(+), 23 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 436168d0d6faf..dd32c19ba178e 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -55,7 +55,9 @@ std::vector GetStringFunctionRegistry() { UNARY_UNSAFE_NULL_IF_NULL(length, {}, utf8, int32), UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, {}, binary, int32), UNARY_UNSAFE_NULL_IF_NULL(reverse, {}, utf8, utf8), - UNARY_UNSAFE_NULL_IF_NULL(trim, {}, utf8, utf8), + UNARY_UNSAFE_NULL_IF_NULL(ltrim, {}, utf8, utf8), + UNARY_UNSAFE_NULL_IF_NULL(rtrim, {}, utf8, utf8), + UNARY_UNSAFE_NULL_IF_NULL(btrim, {}, utf8, utf8), UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}), UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}), @@ -83,6 +85,15 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder), + NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), + kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext), + + NativeFunction("rtrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), + kResultNullIfNull, "rtrim_utf8_utf8", NativeFunction::kNeedsContext), + + NativeFunction("btrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), + kResultNullIfNull, 
"btrim_utf8_utf8", NativeFunction::kNeedsContext), + NativeFunction("substr", {"substring"}, DataTypeVector{utf8(), int64() /*offset*/, int64() /*length*/}, utf8(), kResultNullIfNull, "substr_utf8_int64_int64", diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 102532c193c84..f6ef79cdb5ef4 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -16,7 +16,6 @@ // under the License. // String functions - #include "arrow/util/value_parsing.h" extern "C" { @@ -286,10 +285,48 @@ const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len return ret; } -// Trim a utf8 sequence +// Trims whitespaces from the left end of the input utf8 sequence +FORCE_INLINE +const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, + int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + gdv_int32 start = 0; + // start denotes the first position of non-space characters in the input string + while (start < data_len && data[start] == ' ') { + ++start; + } + + *out_len = data_len - start; + return data + start; +} + +// Trims whitespaces from the right end of the input utf8 sequence FORCE_INLINE -const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, - int32_t* out_len) { +const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, + int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + gdv_int32 end = data_len - 1; + // end denotes the last position of non-space characters in the input string + while (end >= 0 && data[end] == ' ') { + --end; + } + + *out_len = end + 1; + return data; +} + +// Trims whitespaces from both the ends of the input utf8 sequence +FORCE_INLINE +const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, + int32_t* out_len) { if (data_len == 0) { *out_len = 0; return ""; @@ -305,21 +342,145 @@ const char* 
trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, --end; } - // string with no leading/trailing spaces, return original string - if (start == 0 && end == data_len - 1) { - *out_len = data_len; - return data; + // string has some leading/trailing spaces and some non-space characters + *out_len = end - start + 1; + return data + start; +} + +// Trims characters present in the trim text from the left end of the base text +FORCE_INLINE +const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext, + gdv_int32 basetext_len, const char* trimtext, + gdv_int32 trimtext_len, int32_t* out_len) { + if (basetext_len == 0) { + *out_len = 0; + return ""; + } else if (trimtext_len == 0) { + *out_len = basetext_len; + return basetext; + } + + gdv_int32 start_ptr, char_len; + // scan the base text from left to right and increment the start pointer till + // there is a character which is not present in the trim text + for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) { + char_len = utf8_char_length(basetext[start_ptr]); + if (char_len == 0 || start_ptr + char_len > basetext_len) { + // invalid byte or incomplete glyph + set_error_for_invalid_utf(context, basetext[start_ptr]); + *out_len = 0; + return ""; + } + if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) { + break; + } } - // string with all spaces - if (start > end) { + *out_len = basetext_len - start_ptr; + return basetext + start_ptr; +} + +// Trims characters present in the trim text from the right end of the base text +FORCE_INLINE +const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext, + gdv_int32 basetext_len, const char* trimtext, + gdv_int32 trimtext_len, int32_t* out_len) { + if (basetext_len == 0) { *out_len = 0; return ""; + } else if (trimtext_len == 0) { + *out_len = basetext_len; + return basetext; + } + + gdv_int32 char_len, end_ptr, byte_cnt = 1; + // scan the base text from right to left and decrement the end pointer till 
+ // there is a character which is not present in the trim text + for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) { + char_len = utf8_char_length(basetext[end_ptr]); + if (char_len == 0) { // trailing bytes of multibyte character + ++byte_cnt; + continue; + } + // this is the first byte of a character, hence check if char_len == byte_cnt + if (byte_cnt != char_len) { // invalid byte or incomplete glyph + set_error_for_invalid_utf(context, basetext[end_ptr]); + *out_len = 0; + return ""; + } + byte_cnt = 1; // reset the counter + if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) { + break; + } } - // string has some leading/trailing spaces and some non-space characters - *out_len = end - start + 1; - return data + start; + // when all characters in the basetext are part of the trimtext + if (end_ptr == -1) { + *out_len = 0; + return ""; + } + + end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character + *out_len = end_ptr; + return basetext; +} + +// Trims characters present in the trim text from both ends of the base text +FORCE_INLINE +const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext, + gdv_int32 basetext_len, const char* trimtext, + gdv_int32 trimtext_len, int32_t* out_len) { + if (basetext_len == 0) { + *out_len = 0; + return ""; + } else if (trimtext_len == 0) { + *out_len = basetext_len; + return basetext; + } + + gdv_int32 start_ptr, end_ptr, char_len, byte_cnt = 1; + // scan the base text from left to right and increment the start and decrement the + // end pointers till there are characters which are not present in the trim text + for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) { + char_len = utf8_char_length(basetext[start_ptr]); + if (char_len == 0 || start_ptr + char_len > basetext_len) { + // invalid byte or incomplete glyph + set_error_for_invalid_utf(context, basetext[start_ptr]); + *out_len = 0; + return ""; + } + if (!is_substr_utf8_utf8(trimtext, 
trimtext_len, basetext + start_ptr, char_len)) { + break; + } + } + for (end_ptr = basetext_len - 1; end_ptr >= start_ptr; --end_ptr) { + char_len = utf8_char_length(basetext[end_ptr]); + if (char_len == 0) { // trailing byte in multibyte character + ++byte_cnt; + continue; + } + // this is the first byte of a character, hence check if char_len == byte_cnt + if (byte_cnt != char_len) { // invalid byte or incomplete glyph + set_error_for_invalid_utf(context, basetext[end_ptr]); + *out_len = 0; + return ""; + } + byte_cnt = 1; // reset the counter + if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) { + break; + } + } + + // when all characters are trimmed, start_ptr has been incremented to basetext_len and + // end_ptr still points to basetext_len - 1, hence we need to handle this case + if (start_ptr > end_ptr) { + *out_len = 0; + return ""; + } + + end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character + *out_len = end_ptr - start_ptr; + return basetext + start_ptr; } // Truncates the string to given length @@ -680,7 +841,7 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, int32_t len) { \ gdv_##OUT_TYPE val = 0; \ int32_t trimmed_len; \ - data = trim_utf8(context, data, len, &trimmed_len); \ + data = btrim_utf8(context, data, len, &trimmed_len); \ if (!arrow::internal::StringConverter::Convert(data, trimmed_len, \ &val)) { \ std::string err = "Failed to cast the string " + std::string(data, trimmed_len) + \ diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 3d90fcc5b2aaa..88345d56cb845 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -426,29 +426,275 @@ TEST(TestStringOps, TestReverse) { ctx.Reset(); } -TEST(TestStringOps, TestTrim) { +TEST(TestStringOps, TestLtrim) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); gdv_int32 
out_len = 0; const char* out_str; - out_str = trim_utf8(ctx_ptr, "TestString", 10, &out_len); + out_str = ltrim_utf8(ctx_ptr, "TestString ", 12, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString "); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8(ctx_ptr, " TestString ", 18, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString "); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8(ctx_ptr, " Test çåå†bD", 18, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD"); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8(ctx_ptr, "", 0, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8(ctx_ptr, " ", 6, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "def"); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "ababbac", 7, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "def"); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "ååçåå†eç†Dd", 21, "çåå†", 9, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "eç†Dd"); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + std::string d( + "aa\xc3" + "bcd"); + out_str = + ltrim_utf8_utf8(ctx_ptr, d.data(), static_cast(d.length()), "a", 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), + "\xc3" + 
"bcd"); + EXPECT_FALSE(ctx.has_error()); + + std::string e( + "åå\xe0\xa0" + "bcd"); + out_str = + ltrim_utf8_utf8(ctx_ptr, e.data(), static_cast(e.length()), "å", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), + "\xE0\xa0" + "bcd"); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = ltrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); +} + +TEST(TestStringOps, TestRtrim) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + const char* out_str; + + out_str = rtrim_utf8(ctx_ptr, " TestString", 12, &out_len); + EXPECT_EQ(std::string(out_str, out_len), " TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8(ctx_ptr, " TestString ", 18, &out_len); + EXPECT_EQ(std::string(out_str, out_len), " TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8(ctx_ptr, "Test çåå†bD ", 20, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8(ctx_ptr, "", 0, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8(ctx_ptr, " ", 6, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "ring", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestSt"); + EXPECT_FALSE(ctx.has_error()); + 
+ out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "def"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "ababbac", 7, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "def"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "eDdç†ååçåå†", 21, "çåå†", 9, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "eDd"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + std::string d( + "\xc3" + "aaa"); + out_str = + rtrim_utf8_utf8(ctx_ptr, d.data(), static_cast(d.length()), "a", 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_TRUE(ctx.has_error()); + ctx.Reset(); + + std::string e( + "\xe0\xa0" + "åå"); + out_str = + rtrim_utf8_utf8(ctx_ptr, e.data(), static_cast(e.length()), "å", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_TRUE(ctx.has_error()); + ctx.Reset(); + + out_str = rtrim_utf8_utf8(ctx_ptr, "åeçå", 7, "çå", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "åe"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = rtrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); +} + +TEST(TestStringOps, TestBtrim) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + const char* out_str; + + out_str = btrim_utf8(ctx_ptr, "TestString", 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); EXPECT_FALSE(ctx.has_error()); - out_str = trim_utf8(ctx_ptr, " TestString ", 18, &out_len); + out_str = 
btrim_utf8(ctx_ptr, " TestString ", 18, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); EXPECT_FALSE(ctx.has_error()); - out_str = trim_utf8(ctx_ptr, " Test çåå†bD ", 21, &out_len); + out_str = btrim_utf8(ctx_ptr, " Test çåå†bD ", 21, &out_len); EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD"); EXPECT_FALSE(ctx.has_error()); - out_str = trim_utf8(ctx_ptr, "", 0, &out_len); + out_str = btrim_utf8(ctx_ptr, "", 0, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); EXPECT_FALSE(ctx.has_error()); - out_str = trim_utf8(ctx_ptr, " ", 6, &out_len); + out_str = btrim_utf8(ctx_ptr, " ", 6, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "Test", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "String"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "String", 6, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Tes"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "def"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "ababbac", 7, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "def"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "ååçåå†Ddeç†", 21, "çåå†", 9, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Dde"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + 
EXPECT_FALSE(ctx.has_error()); + ctx.Reset(); + + std::string d( + "acd\xc3" + "aaa"); + out_str = + btrim_utf8_utf8(ctx_ptr, d.data(), static_cast(d.length()), "a", 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_TRUE(ctx.has_error()); + ctx.Reset(); + + std::string e( + "åbc\xe0\xa0" + "åå"); + out_str = + btrim_utf8_utf8(ctx_ptr, e.data(), static_cast(e.length()), "å", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_TRUE(ctx.has_error()); + ctx.Reset(); + + std::string f( + "aa\xc3" + "bcd"); + out_str = + btrim_utf8_utf8(ctx_ptr, f.data(), static_cast(f.length()), "a", 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), + "\xc3" + "bcd"); + EXPECT_FALSE(ctx.has_error()); + + std::string g( + "åå\xe0\xa0" + "bcå"); + out_str = + btrim_utf8_utf8(ctx_ptr, g.data(), static_cast(g.length()), "å", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), + "\xe0\xa0" + "bc"); + + out_str = btrim_utf8_utf8(ctx_ptr, "åe†çå", 10, "çå", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "e†"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + EXPECT_FALSE(ctx.has_error()); + + out_str = btrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); EXPECT_FALSE(ctx.has_error()); } diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 3a30dae2015a4..77f1589a73b2b 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -165,6 +165,8 @@ bool is_substr_utf8_utf8(const char* data, gdv_int32 data_len, const char* subst gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len); +gdv_int32 utf8_last_char_pos(gdv_int64 context, const char* data, gdv_int32 data_len); + gdv_date64 castDATE_utf8(int64_t execution_context, const char* input, gdv_int32 length); 
gdv_date64 castDATE_int64(gdv_int64 date); @@ -200,8 +202,26 @@ const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, int32_t* out_len); -const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, - int32_t* out_len); +const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, + int32_t* out_len); + +const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, + int32_t* out_len); + +const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, + int32_t* out_len); + +const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext, + gdv_int32 basetext_len, const char* trimtext, + gdv_int32 trimtext_len, int32_t* out_len); + +const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext, + gdv_int32 basetext_len, const char* trimtext, + gdv_int32 trimtext_len, int32_t* out_len); + +const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext, + gdv_int32 basetext_len, const char* trimtext, + gdv_int32 trimtext_len, int32_t* out_len); gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len, const char* str, gdv_int32 str_len);