diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index c1b6ef1648643..3140727a539f0 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -439,6 +439,13 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
                      kResultNullIfNull, "right_utf8_int32",
                      NativeFunction::kNeedsContext),
 
+      // binary(x): pass-through casts of binary/utf8 input to binary.
+      NativeFunction("binary", {}, DataTypeVector{binary()}, binary(), kResultNullIfNull,
+                     "castBINARY_binary"),
+
+      NativeFunction("binary", {}, DataTypeVector{utf8()}, binary(), kResultNullIfNull,
+                     "castBINARY_utf8"),
+
       NativeFunction("castVARBINARY", {}, DataTypeVector{binary(), int64()}, binary(),
                      kResultNullIfNull, "castVARBINARY_binary_int64",
                      NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 5c369cc6f4f83..2cbb1f0110a5e 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -705,6 +705,21 @@ CAST_VARCHAR_FROM_VARLEN_TYPE(binary)
 CAST_VARBINARY_FROM_STRING_AND_BINARY(utf8)
 CAST_VARBINARY_FROM_STRING_AND_BINARY(binary)
 
+// Defines castBINARY_<TYPE>: a pass-through cast that stores data_len in
+// *out_length and returns the input pointer unchanged (no copy is made).
+#define CAST_BINARY_FROM_STRING_AND_BINARY(TYPE)                      \
+  GANDIVA_EXPORT                                                      \
+  const char* castBINARY_##TYPE(const char* data, gdv_int32 data_len, \
+                                int32_t* out_length) {                \
+    *out_length = data_len;                                           \
+    return data;                                                      \
+  }
+
+CAST_BINARY_FROM_STRING_AND_BINARY(utf8)
+CAST_BINARY_FROM_STRING_AND_BINARY(binary)
+
+#undef CAST_BINARY_FROM_STRING_AND_BINARY
+
 #undef CAST_VARBINARY_FROM_STRING_AND_BINARY
 
 #define IS_NULL(NAME, TYPE)                                                \
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index a9bd68e29d299..2f8212d8b8d61 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -879,6 +879,41 @@ TEST(TestGdvFnStubs, TestCastVarbinaryBinary) {
   ctx.Reset();
 }
 
+// castBINARY_utf8 must expose exactly the first data_len bytes of the input.
+TEST(TestGdvFnStubs, TestCastBinaryUtf8) {
+  int32_t out_len = 0;
+  const char* input = "abc";
+  const char* out;
+
+  out = castBINARY_utf8(input, 3, &out_len);
+  EXPECT_EQ(std::string(out, out_len), input);
+
+  out = castBINARY_utf8(input, 2, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "ab");
+
+  out = castBINARY_utf8(input, 1, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "a");
+
+  out = castBINARY_utf8(input, 0, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "");
+}
+
+// The literal is 12 plain characters (escaped backslashes), hence lengths 12 and 8.
+TEST(TestGdvFnStubs, TestCastBinaryBinary) {
+  int32_t out_len = 0;
+  const char* input = "\\x41\\x42\\x43";
+  const char* out;
+
+  out = castBINARY_binary(input, 12, &out_len);
+  EXPECT_EQ(std::string(out, out_len), input);
+
+  out = castBINARY_binary(input, 8, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "\\x41\\x42");
+
+  out = castBINARY_binary(input, 0, &out_len);
+  EXPECT_EQ(std::string(out, out_len), "");
+}
+
 TEST(TestStringOps, TestConcat) {
   gandiva::ExecutionContext ctx;
   uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 57a2718c81bd1..f7512ba2b5cc8 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -573,6 +573,11 @@ const char* castVARBINARY_binary_int64(gdv_int64 context, const char* data,
                                        gdv_int32 data_len, int64_t out_len,
                                        int32_t* out_length);
 
+// Pass-through casts to binary: the returned pointer is `data` itself.
+const char* castBINARY_utf8(const char* data, gdv_int32 data_len, int32_t* out_length);
+
+const char* castBINARY_binary(const char* data, gdv_int32 data_len, int32_t* out_length);
+
 gdv_int32 levenshtein(int64_t context, const char* in1, int32_t in1_len, const char* in2,
                       int32_t in2_len);
 
diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc
index 893319280cd83..44ae5fc6f9b12 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -2682,4 +2682,115 @@ TEST_F(TestProjector, TestRegexpExtract) {
   EXPECT_ARROW_ARRAY_EQUALS(exp_extract, outputs.at(0));
 }
 
+// castVARBINARY(utf8, int64): requested lengths equal the input sizes, values unchanged.
+TEST_F(TestProjector, TestCastVarbinary) {
+  auto field0 = field("f0", arrow::utf8());
+  auto field1 = field("f1", arrow::int64());
+  auto schema = arrow::schema({field0, field1});
+
+  // output fields
+  auto res_out1 = field("res_out1", arrow::binary());
+
+  // Build expression
+  auto cast_expr_1 =
+      TreeExprBuilder::MakeExpression("castVARBINARY", {field0, field1}, res_out1);
+
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {cast_expr_1}, TestConfiguration(), &projector);
+
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 2;
+
+  auto array0 = MakeArrowArrayUtf8({"a", "abc"}, {true, true});
+
+  auto array1 = MakeArrowArrayInt64({1, 3}, {true, true});
+
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+  auto out_1 = MakeArrowArrayBinary({"a", "abc"}, {true, true});
+
+  arrow::ArrayVector outputs;
+
+  // Evaluate expression
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(out_1, outputs.at(0));
+}
+
+// binary(utf8) is expected to yield the same bytes, including the empty string.
+TEST_F(TestProjector, TestCastBinaryUTF) {
+  auto field0 = field("f0", arrow::utf8());
+  auto schema = arrow::schema({field0});
+
+  // output fields
+  auto res_out1 = field("res_out1", arrow::binary());
+
+  // Build expression
+  auto cast_expr_1 = TreeExprBuilder::MakeExpression("binary", {field0}, res_out1);
+
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {cast_expr_1}, TestConfiguration(), &projector);
+
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 3;
+
+  auto array0 = MakeArrowArrayUtf8({"a", "abc", ""}, {true, true, true});
+
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+  auto out_1 = MakeArrowArrayBinary({"a", "abc", ""}, {true, true, true});
+
+  arrow::ArrayVector outputs;
+
+  // Evaluate expression
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(out_1, outputs.at(0));
+}
+
+// binary(binary) is expected to be an identity projection over a binary column.
+TEST_F(TestProjector, TestCastBinaryBinary) {
+  auto field0 = field("f0", arrow::binary());
+  auto schema = arrow::schema({field0});
+
+  // output fields
+  auto res_out1 = field("res_out1", arrow::binary());
+
+  // Build expression
+  auto cast_expr_1 = TreeExprBuilder::MakeExpression("binary", {field0}, res_out1);
+
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {cast_expr_1}, TestConfiguration(), &projector);
+
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 3;
+
+  auto array0 =
+      MakeArrowArrayBinary({"\\x41\\x42\\x43", "\\x41\\x42", ""}, {true, true, true});
+
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
+
+  auto out_1 =
+      MakeArrowArrayBinary({"\\x41\\x42\\x43", "\\x41\\x42", ""}, {true, true, true});
+
+  arrow::ArrayVector outputs;
+
+  // Evaluate expression
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  EXPECT_ARROW_ARRAY_EQUALS(out_1, outputs.at(0));
+}
+
 }  // namespace gandiva