Skip to content

Commit

Permalink
ARROW-16527: [Gandiva][C++] Add binary functions
Browse files Browse the repository at this point in the history
This PR implements binary functions on the Gandiva side, based on the [Hive implementation](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFToBinary.java).

This PR implements the following signatures:

FunctionSignature{name =binary, return type =binary, param types =[string]}
FunctionSignature{name =binary, return type =binary, param types =[binary]}

Closes #13073 from Johnnathanalmeida/feature/add-binary-function

Authored-by: Johnnathan <johnnathanalmeida@gmail.com>
Signed-off-by: Pindikura Ravindra <ravindra@dremio.com>
  • Loading branch information
Johnnathanalmeida authored and Pindikura Ravindra committed May 25, 2022
1 parent 3b92f02 commit f766159
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 0 deletions.
6 changes: 6 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Expand Up @@ -439,6 +439,12 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "right_utf8_int32",
NativeFunction::kNeedsContext),

// binary(binary) -> binary, registered with kResultNullIfNull and bound to the
// precompiled symbol "castBINARY_binary".
NativeFunction("binary", {}, DataTypeVector{binary()}, binary(), kResultNullIfNull,
"castBINARY_binary"),

// binary(utf8) -> binary, registered with kResultNullIfNull and bound to the
// precompiled symbol "castBINARY_utf8".
NativeFunction("binary", {}, DataTypeVector{utf8()}, binary(), kResultNullIfNull,
"castBINARY_utf8"),

NativeFunction("castVARBINARY", {}, DataTypeVector{binary(), int64()}, binary(),
kResultNullIfNull, "castVARBINARY_binary_int64",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops.cc
Expand Up @@ -705,6 +705,17 @@ CAST_VARCHAR_FROM_VARLEN_TYPE(binary)
CAST_VARBINARY_FROM_STRING_AND_BINARY(utf8)
CAST_VARBINARY_FROM_STRING_AND_BINARY(binary)

// Defines castBINARY_<TYPE>(data, data_len, out_length), the cast of a
// variable-length <TYPE> value to binary.
//
// The generated function performs no copy: it stores data_len in *out_length and
// returns the very same `data` pointer it was given.
#define CAST_BINARY_FROM_STRING_AND_BINARY(TYPE)                        \
  GANDIVA_EXPORT                                                        \
  const char* castBINARY_##TYPE(const char* data, gdv_int32 data_len,   \
                                int32_t* out_length) {                  \
    *out_length = data_len;                                             \
    return data;                                                        \
  }

CAST_BINARY_FROM_STRING_AND_BINARY(utf8)
CAST_BINARY_FROM_STRING_AND_BINARY(binary)

// The helper macro is only needed for the two instantiations above; do not let it
// leak into the rest of the translation unit.
#undef CAST_BINARY_FROM_STRING_AND_BINARY

#undef CAST_VARBINARY_FROM_STRING_AND_BINARY

#define IS_NULL(NAME, TYPE) \
Expand Down
33 changes: 33 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Expand Up @@ -879,6 +879,39 @@ TEST(TestGdvFnStubs, TestCastVarbinaryBinary) {
ctx.Reset();
}

// castBINARY_utf8 must expose exactly the first `len` bytes of its input.
TEST(TestGdvFnStubs, TestCastBinaryUtf8) {
  const char* const text = "abc";

  // Runs the cast on the first `len` bytes of `text` and materializes the result.
  auto cast_prefix = [&](int32_t len) {
    int32_t result_len = 0;
    const char* result = castBINARY_utf8(text, len, &result_len);
    return std::string(result, result_len);
  };

  EXPECT_EQ(cast_prefix(3), text);
  EXPECT_EQ(cast_prefix(2), "ab");
  EXPECT_EQ(cast_prefix(1), "a");
  EXPECT_EQ(cast_prefix(0), "");
}

// castBINARY_binary must expose exactly the first `len` bytes of its input.
TEST(TestGdvFnStubs, TestCastBinaryBinary) {
  // The backslashes are escaped, so this is the 12-character text \x41\x42\x43
  // (four characters per group), not the three bytes "ABC".
  const char* const payload = "\\x41\\x42\\x43";

  // Runs the cast on the first `len` bytes of `payload` and materializes the result.
  auto cast_prefix = [&](int32_t len) {
    int32_t result_len = 0;
    const char* result = castBINARY_binary(payload, len, &result_len);
    return std::string(result, result_len);
  };

  EXPECT_EQ(cast_prefix(12), payload);
  EXPECT_EQ(cast_prefix(8), "\\x41\\x42");
  EXPECT_EQ(cast_prefix(0), "");
}

TEST(TestStringOps, TestConcat) {
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/gandiva/precompiled/types.h
Expand Up @@ -573,6 +573,10 @@ const char* castVARBINARY_binary_int64(gdv_int64 context, const char* data,
gdv_int32 data_len, int64_t out_len,
int32_t* out_length);

// Cast of a utf8 value to binary. Stores data_len in *out_length and returns the
// same `data` pointer that was passed in (no copy is made).
const char* castBINARY_utf8(const char* data, gdv_int32 data_len, int32_t* out_length);

// Cast of a binary value to binary. Stores data_len in *out_length and returns the
// same `data` pointer that was passed in (no copy is made).
const char* castBINARY_binary(const char* data, gdv_int32 data_len, int32_t* out_length);

gdv_int32 levenshtein(int64_t context, const char* in1, int32_t in1_len, const char* in2,
int32_t in2_len);

Expand Down
107 changes: 107 additions & 0 deletions cpp/src/gandiva/tests/projector_test.cc
Expand Up @@ -2682,4 +2682,111 @@ TEST_F(TestProjector, TestRegexpExtract) {
EXPECT_ARROW_ARRAY_EQUALS(exp_extract, outputs.at(0));
}

// Projects castVARBINARY(utf8, int64) over a two-row batch and checks the binary
// output column.
TEST_F(TestProjector, TestCastVarbinary) {
  auto field0 = field("f0", arrow::utf8());
  auto field1 = field("f1", arrow::int64());
  auto schema = arrow::schema({field0, field1});

  // output fields
  auto res_out1 = field("res_out1", arrow::binary());

  // Build expression
  auto cast_expr_1 =
      TreeExprBuilder::MakeExpression("castVARBINARY", {field0, field1}, res_out1);

  std::shared_ptr<Projector> projector;

  auto status = Projector::Make(schema, {cast_expr_1}, TestConfiguration(), &projector);
  // Fatal assertion: `projector` is dereferenced below, so the test must stop here
  // if it could not be built.
  ASSERT_TRUE(status.ok()) << status.message();

  // Create a row-batch with some sample data
  int num_records = 2;

  auto array0 = MakeArrowArrayUtf8({"a", "abc"}, {true, true});
  auto array1 = MakeArrowArrayInt64({1, 3}, {true, true});

  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});

  // For this data the expected binary values are the same bytes as the inputs.
  auto out_1 = MakeArrowArrayBinary({"a", "abc"}, {true, true});

  arrow::ArrayVector outputs;

  // Evaluate expression
  status = projector->Evaluate(*in_batch, pool_, &outputs);
  // Fatal assertion: outputs.at(0) below would throw on an empty result vector.
  ASSERT_TRUE(status.ok()) << status.message();

  EXPECT_ARROW_ARRAY_EQUALS(out_1, outputs.at(0));
}

// Projects binary(utf8) over a three-row batch (including an empty string) and
// checks that the binary output holds the same bytes.
TEST_F(TestProjector, TestCastBinaryUTF) {
  auto field0 = field("f0", arrow::utf8());
  auto schema = arrow::schema({field0});

  // output fields
  auto res_out1 = field("res_out1", arrow::binary());

  // Build expression
  auto cast_expr_1 = TreeExprBuilder::MakeExpression("binary", {field0}, res_out1);

  std::shared_ptr<Projector> projector;

  auto status = Projector::Make(schema, {cast_expr_1}, TestConfiguration(), &projector);
  // Fatal assertion: `projector` is dereferenced below, so the test must stop here
  // if it could not be built.
  ASSERT_TRUE(status.ok()) << status.message();

  // Create a row-batch with some sample data
  int num_records = 3;

  auto array0 = MakeArrowArrayUtf8({"a", "abc", ""}, {true, true, true});

  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});

  auto out_1 = MakeArrowArrayBinary({"a", "abc", ""}, {true, true, true});

  arrow::ArrayVector outputs;

  // Evaluate expression
  status = projector->Evaluate(*in_batch, pool_, &outputs);
  // Fatal assertion: outputs.at(0) below would throw on an empty result vector.
  ASSERT_TRUE(status.ok()) << status.message();

  EXPECT_ARROW_ARRAY_EQUALS(out_1, outputs.at(0));
}

// Projects binary(binary) over a three-row batch (including an empty value) and
// checks that the output holds the same bytes.
TEST_F(TestProjector, TestCastBinaryBinary) {
  auto field0 = field("f0", arrow::binary());
  auto schema = arrow::schema({field0});

  // output fields
  auto res_out1 = field("res_out1", arrow::binary());

  // Build expression
  auto cast_expr_1 = TreeExprBuilder::MakeExpression("binary", {field0}, res_out1);

  std::shared_ptr<Projector> projector;

  auto status = Projector::Make(schema, {cast_expr_1}, TestConfiguration(), &projector);
  // Fatal assertion: `projector` is dereferenced below, so the test must stop here
  // if it could not be built.
  ASSERT_TRUE(status.ok()) << status.message();

  // Create a row-batch with some sample data
  int num_records = 3;

  // The input column must be a binary array so that it matches the schema field
  // type (arrow::binary()); a utf8 array here would disagree with the schema.
  // Note the escaped backslashes: the values are the literal text \x41..., not
  // the bytes 0x41....
  auto array0 =
      MakeArrowArrayBinary({"\\x41\\x42\\x43", "\\x41\\x42", ""}, {true, true, true});

  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});

  auto out_1 =
      MakeArrowArrayBinary({"\\x41\\x42\\x43", "\\x41\\x42", ""}, {true, true, true});

  arrow::ArrayVector outputs;

  // Evaluate expression
  status = projector->Evaluate(*in_batch, pool_, &outputs);
  // Fatal assertion: outputs.at(0) below would throw on an empty result vector.
  ASSERT_TRUE(status.ok()) << status.message();

  EXPECT_ARROW_ARRAY_EQUALS(out_1, outputs.at(0));
}
} // namespace gandiva

0 comments on commit f766159

Please sign in to comment.