Skip to content

Commit

Permalink
ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string
Browse files Browse the repository at this point in the history
Closes #7641 from sagnikc-dremio/master and squashes the following commits:

4a9985f <Sagnik Chakraborty> ARROW-9328:  Add LTRIM, RTRIM, BTRIM functions for string

Authored-by: Sagnik Chakraborty <sagnikc@dremio.com>
Signed-off-by: Praveen <praveen@dremio.com>
  • Loading branch information
sgnkc authored and kszucs committed Jul 24, 2020
1 parent a284504 commit baf2094
Show file tree
Hide file tree
Showing 4 changed files with 461 additions and 23 deletions.
13 changes: 12 additions & 1 deletion cpp/src/gandiva/function_registry_string.cc
Expand Up @@ -55,7 +55,9 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
UNARY_UNSAFE_NULL_IF_NULL(length, {}, utf8, int32),
UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, {}, binary, int32),
UNARY_UNSAFE_NULL_IF_NULL(reverse, {}, utf8, utf8),
UNARY_UNSAFE_NULL_IF_NULL(trim, {}, utf8, utf8),
UNARY_UNSAFE_NULL_IF_NULL(ltrim, {}, utf8, utf8),
UNARY_UNSAFE_NULL_IF_NULL(rtrim, {}, utf8, utf8),
UNARY_UNSAFE_NULL_IF_NULL(btrim, {}, utf8, utf8),

UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
Expand Down Expand Up @@ -83,6 +85,15 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "gdv_fn_like_utf8_utf8",
NativeFunction::kNeedsFunctionHolder),

NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext),

NativeFunction("rtrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "rtrim_utf8_utf8", NativeFunction::kNeedsContext),

NativeFunction("btrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "btrim_utf8_utf8", NativeFunction::kNeedsContext),

NativeFunction("substr", {"substring"},
DataTypeVector{utf8(), int64() /*offset*/, int64() /*length*/},
utf8(), kResultNullIfNull, "substr_utf8_int64_int64",
Expand Down
189 changes: 175 additions & 14 deletions cpp/src/gandiva/precompiled/string_ops.cc
Expand Up @@ -16,7 +16,6 @@
// under the License.

// String functions

#include "arrow/util/value_parsing.h"

extern "C" {
Expand Down Expand Up @@ -286,10 +285,48 @@ const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len
return ret;
}

// Strips leading space characters (' ', 0x20) from a utf8 sequence.
//
// No bytes are copied: the returned pointer aliases `data`, advanced past the
// leading spaces, and the number of remaining bytes is stored in `*out_len`.
// A zero-length input yields the literal "" with `*out_len` set to 0.
// `context` is not used by this function.
FORCE_INLINE
const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
                       int32_t* out_len) {
  if (data_len == 0) {
    *out_len = 0;
    return "";
  }

  // Count how many bytes at the front of the input are spaces.
  gdv_int32 leading_spaces = 0;
  for (; leading_spaces < data_len; ++leading_spaces) {
    if (data[leading_spaces] != ' ') {
      break;
    }
  }

  *out_len = data_len - leading_spaces;
  return data + leading_spaces;
}

// Trims space characters (' ', 0x20) from the right end of the input utf8 sequence.
//
// No bytes are copied: the returned pointer is `data` itself and `*out_len`
// receives the length of the prefix that remains once trailing spaces are
// dropped (0 when the input consists only of spaces).
// A zero-length input yields the literal "" with `*out_len` set to 0.
// `context` is not used by this function.
FORCE_INLINE
const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
                       int32_t* out_len) {
  if (data_len == 0) {
    *out_len = 0;
    return "";
  }

  gdv_int32 end = data_len - 1;
  // end denotes the last position of non-space characters in the input string;
  // it becomes -1 when every byte is a space
  while (end >= 0 && data[end] == ' ') {
    --end;
  }

  *out_len = end + 1;
  return data;
}

// Trims whitespaces from both the ends of the input utf8 sequence
FORCE_INLINE
const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
int32_t* out_len) {
if (data_len == 0) {
*out_len = 0;
return "";
Expand All @@ -305,21 +342,145 @@ const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
--end;
}

// string with no leading/trailing spaces, return original string
if (start == 0 && end == data_len - 1) {
*out_len = data_len;
return data;
// string has some leading/trailing spaces and some non-space characters
*out_len = end - start + 1;
return data + start;
}

// Trims characters present in the trim text from the left end of the base text.
//
// The base text is walked one UTF-8 character at a time from the left; a
// character is dropped while its byte sequence is found in `trimtext` (checked
// with is_substr_utf8_utf8). The walk stops at the first character that is not
// found there.
//
// No bytes are copied: the returned pointer aliases `basetext`, advanced past
// the trimmed characters, and `*out_len` receives the remaining byte count.
//  - empty base text  -> "" with *out_len = 0
//  - empty trim text  -> `basetext` unchanged with *out_len = basetext_len
//  - a byte that cannot start a character (utf8_char_length gives 0) or a
//    character that runs past the end of the buffer -> the byte is reported
//    through set_error_for_invalid_utf and "" with *out_len = 0 is returned
FORCE_INLINE
const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
                            gdv_int32 basetext_len, const char* trimtext,
                            gdv_int32 trimtext_len, int32_t* out_len) {
  if (basetext_len == 0) {
    *out_len = 0;
    return "";
  } else if (trimtext_len == 0) {
    *out_len = basetext_len;
    return basetext;
  }

  gdv_int32 start_ptr = 0;
  gdv_int32 char_len = 0;
  // scan the base text from left to right and increment the start pointer till
  // there is a character which is not present in the trim text
  for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
    char_len = utf8_char_length(basetext[start_ptr]);
    if (char_len == 0 || start_ptr + char_len > basetext_len) {
      // invalid byte or incomplete glyph
      set_error_for_invalid_utf(context, basetext[start_ptr]);
      *out_len = 0;
      return "";
    }
    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
      break;
    }
  }

  // start_ptr equals basetext_len when every character was trimmed, which
  // gives an output length of 0
  *out_len = basetext_len - start_ptr;
  return basetext + start_ptr;
}

// Trims characters present in the trim text from the right end of the base text.
//
// The base text is walked byte by byte from the right. Bytes for which
// utf8_char_length gives 0 are treated as trailing bytes of a multibyte
// character and counted; when the first byte of the character is reached the
// count must match the length that byte announces. A complete character is
// dropped while its byte sequence is found in `trimtext` (checked with
// is_substr_utf8_utf8); the walk stops at the first character not found there.
//
// No bytes are copied: the returned pointer is `basetext` itself and
// `*out_len` receives the length of the prefix that remains.
//  - empty base text  -> "" with *out_len = 0
//  - empty trim text  -> `basetext` unchanged with *out_len = basetext_len
//  - every character trimmed -> "" with *out_len = 0
//  - malformed UTF-8 met during the walk -> the byte is reported through
//    set_error_for_invalid_utf and "" with *out_len = 0 is returned
FORCE_INLINE
const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
                            gdv_int32 basetext_len, const char* trimtext,
                            gdv_int32 trimtext_len, int32_t* out_len) {
  if (basetext_len == 0) {
    *out_len = 0;
    return "";
  } else if (trimtext_len == 0) {
    *out_len = basetext_len;
    return basetext;
  }

  gdv_int32 char_len = 0;
  gdv_int32 end_ptr = 0;
  gdv_int32 byte_cnt = 1;  // bytes seen so far for the character being assembled
  // scan the base text from right to left and decrement the end pointer till
  // there is a character which is not present in the trim text
  for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) {
    char_len = utf8_char_length(basetext[end_ptr]);
    if (char_len == 0) {  // trailing bytes of multibyte character
      ++byte_cnt;
      continue;
    }
    // this is the first byte of a character, hence check if char_len == byte_cnt
    if (byte_cnt != char_len) {  // invalid byte or incomplete glyph
      set_error_for_invalid_utf(context, basetext[end_ptr]);
      *out_len = 0;
      return "";
    }
    byte_cnt = 1;  // reset the counter
    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
      break;
    }
  }

  // the scan ran off the front of the base text
  if (end_ptr == -1) {
    if (byte_cnt != 1) {
      // trailing bytes were still pending but no first byte precedes them: the
      // base text starts with an orphaned trailing byte, which the
      // left-to-right scans of ltrim/btrim also reject
      set_error_for_invalid_utf(context, basetext[0]);
    }
    // otherwise all characters in the basetext are part of the trimtext
    *out_len = 0;
    return "";
  }

  end_ptr += utf8_char_length(basetext[end_ptr]);  // point to the next character
  *out_len = end_ptr;
  return basetext;
}

// Strips, from both ends of the base text, every character that occurs in the
// trim text.
//
// Two passes are made. The first walks forward one UTF-8 character at a time
// and stops at the first character whose bytes are not found in `trimtext`
// (checked with is_substr_utf8_utf8). The second walks backward byte by byte,
// never going left of where the first pass stopped, assembling characters from
// their trailing bytes and stopping at the first character not found in
// `trimtext`.
//
// No bytes are copied: the returned pointer aliases `basetext` at the first
// kept character and `*out_len` receives the number of kept bytes.
//  - empty base text  -> "" with *out_len = 0
//  - empty trim text  -> `basetext` unchanged with *out_len = basetext_len
//  - every character trimmed -> "" with *out_len = 0
//  - malformed UTF-8 met during either pass -> the byte is reported through
//    set_error_for_invalid_utf and "" with *out_len = 0 is returned
FORCE_INLINE
const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
                            gdv_int32 basetext_len, const char* trimtext,
                            gdv_int32 trimtext_len, int32_t* out_len) {
  if (basetext_len == 0) {
    *out_len = 0;
    return "";
  }
  if (trimtext_len == 0) {
    *out_len = basetext_len;
    return basetext;
  }

  // Forward pass: `left` ends up on the first byte of the first kept character,
  // or at basetext_len when nothing is kept.
  gdv_int32 left = 0;
  while (left < basetext_len) {
    const gdv_int32 glyph_len = utf8_char_length(basetext[left]);
    if (glyph_len == 0 || left + glyph_len > basetext_len) {
      // not a valid first byte, or the character is cut off by the buffer end
      set_error_for_invalid_utf(context, basetext[left]);
      *out_len = 0;
      return "";
    }
    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + left, glyph_len)) {
      break;
    }
    left += glyph_len;
  }

  // Backward pass: `right` ends up on the first byte of the last kept character.
  gdv_int32 right = basetext_len - 1;
  gdv_int32 pending_bytes = 1;  // bytes collected for the character being assembled
  while (right >= left) {
    const gdv_int32 glyph_len = utf8_char_length(basetext[right]);
    if (glyph_len == 0) {
      // trailing byte of a multibyte character: keep collecting
      ++pending_bytes;
      --right;
      continue;
    }
    // first byte of a character: the collected byte count must equal the
    // length this byte announces
    if (pending_bytes != glyph_len) {
      set_error_for_invalid_utf(context, basetext[right]);
      *out_len = 0;
      return "";
    }
    pending_bytes = 1;
    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + right, glyph_len)) {
      break;
    }
    --right;
  }

  // When everything was trimmed the forward pass left `left` at basetext_len
  // while `right` never moved from basetext_len - 1.
  if (left > right) {
    *out_len = 0;
    return "";
  }

  // Step `right` past the last kept character so it marks the exclusive end.
  const gdv_int32 kept_end = right + utf8_char_length(basetext[right]);
  *out_len = kept_end - left;
  return basetext + left;
}

// Truncates the string to given length
Expand Down Expand Up @@ -680,7 +841,7 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
int32_t len) { \
gdv_##OUT_TYPE val = 0; \
int32_t trimmed_len; \
data = trim_utf8(context, data, len, &trimmed_len); \
data = btrim_utf8(context, data, len, &trimmed_len); \
if (!arrow::internal::StringConverter<ARROW_TYPE>::Convert(data, trimmed_len, \
&val)) { \
std::string err = "Failed to cast the string " + std::string(data, trimmed_len) + \
Expand Down

0 comments on commit baf2094

Please sign in to comment.