Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 70 additions & 2 deletions datafusion/functions/src/regex/regexplike.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,14 @@ use datafusion_common::{
ScalarValue,
};
use datafusion_expr::{
Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature,
TypeSignatureClass, Volatility,
binary_expr, cast, Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl,
Signature, TypeSignature, TypeSignatureClass, Volatility,
};
use datafusion_macros::user_doc;

use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
use datafusion_expr_common::operator::Operator;
use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
use std::any::Any;
use std::sync::Arc;

Expand Down Expand Up @@ -153,11 +156,76 @@ impl ScalarUDFImpl for RegexpLikeFunc {
}
}

/// Attempts to rewrite a `regexp_like(str, regexp [, flags])` call into an equivalent
/// binary expression using the operator chosen by `derive_operator`.
///
/// Returns `ExprSimplifyResult::Original(args)` unchanged when no operator can be derived
/// or when `BinaryTypeCoercer::get_input_types` fails for the argument types; otherwise
/// returns `ExprSimplifyResult::Simplified` wrapping the binary expression. Errors from
/// `info.get_data_type` are propagated via `?`.
fn simplify(
&self,
mut args: Vec<Expr>,
info: &dyn SimplifyInfo,
) -> Result<ExprSimplifyResult> {
// Try to simplify regexp_like usage to one of the builtin operators since those have
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you -- these comments really help

// optimized code paths for the case where the regular expression pattern is a scalar.
// Additionally, the expression simplification optimization pass will attempt to further
// simplify regular expression patterns used in operator expressions.
let Some(op) = derive_operator(&args) else {
return Ok(ExprSimplifyResult::Original(args));
};

// Work out which input types the operator form needs for these two operands; if
// coercion is not possible, keep the original function call.
let string_type = info.get_data_type(&args[0])?;
let regexp_type = info.get_data_type(&args[1])?;
let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, &regexp_type);
let Ok((coerced_string_type, coerced_regexp_type)) =
binary_type_coercer.get_input_types()
else {
return Ok(ExprSimplifyResult::Original(args));
};

// regexp_like(str, regexp [, flags])
// Take ownership of the first two arguments without cloning. Index 1 is removed first:
// `swap_remove` moves the last element into the vacated slot, which never disturbs
// index 0, so the string argument is still at index 0 for the second call.
let regexp = args.swap_remove(1);
let string = args.swap_remove(0);

// Only wrap an operand in a cast when its coerced type differs from its current type.
Ok(ExprSimplifyResult::Simplified(binary_expr(
if string_type != coerced_string_type {
cast(string, coerced_string_type)
} else {
string
},
op,
if regexp_type != coerced_regexp_type {
cast(regexp, coerced_regexp_type)
} else {
regexp
},
)))
}

/// Returns the documentation for this function by delegating to `self.doc()`.
// NOTE(review): `doc()` is not defined in the visible source; presumably it is generated
// by the `user_doc` macro imported above -- confirm.
fn documentation(&self) -> Option<&Documentation> {
self.doc()
}
}

fn derive_operator(args: &[Expr]) -> Option<Operator> {
match args.len() {
// regexp_like(str, regexp, flags)
3 => {
match &args[2] {
Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
match flags.as_str() {
"i" => Some(Operator::RegexIMatch),
"" => Some(Operator::RegexMatch),
// Any flags besides 'i' have no operator equivalent
_ => None,
}
}
// `flags` is not a literal, so we can't derive the correct operator statically
_ => None,
}
}
// regexp_like(str, regexp)
2 => Some(Operator::RegexMatch),
// Should never happen, but just in case
_ => None,
}
}

/// Tests a string using a regular expression returning true if at
/// least one match, false otherwise.
///
Expand Down
60 changes: 60 additions & 0 deletions datafusion/sqllogictest/test_files/regexp/regexp_like.slt
Original file line number Diff line number Diff line change
Expand Up @@ -277,3 +277,63 @@ drop table strings

statement ok
drop table dict_table

# Ensure that regexp_like is rewritten to use the (more optimized) regex operators
statement ok
create table regexp_test as values
('foobar', 'i'),
('Foo', 'i'),
('bar', 'mi') ;

# Expressions that can be rewritten to use the ~ operator (which is more optimized)
# (expect the plans to use the ~ / ~* operators, not the REGEXP_LIKE function)
query TT
explain select
regexp_like(column1, 'fo.*'),
regexp_like(column1, 'fo.*', 'i'),
from regexp_test;
----
logical_plan
01)Projection: regexp_test.column1 ~ Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*")), regexp_test.column1 ~* Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))
02)--TableScan: regexp_test projection=[column1]
physical_plan
01)ProjectionExec: expr=[column1@0 ~ fo.* as regexp_like(regexp_test.column1,Utf8("fo.*")), column1@0 ~* fo.* as regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))]
02)--DataSourceExec: partitions=1, partition_sizes=[1]

query BB
select
regexp_like(column1, 'fo.*'),
regexp_like(column1, 'fo.*', 'i'),
from regexp_test;
----
true true
false true
false false

# Expressions that can not be rewritten to use the ~ / ~* operators
# (expect the plans to use the REGEXP_LIKE function)
query TT
explain select
regexp_like(column1, 'f.*r', 'mi'), -- args
regexp_like(column1, 'f.*r', column2) -- non scalar flags
from regexp_test;
----
logical_plan
01)Projection: regexp_like(regexp_test.column1, Utf8("f.*r"), Utf8("mi")), regexp_like(regexp_test.column1, Utf8("f.*r"), regexp_test.column2)
02)--TableScan: regexp_test projection=[column1, column2]
physical_plan
01)ProjectionExec: expr=[regexp_like(column1@0, f.*r, mi) as regexp_like(regexp_test.column1,Utf8("f.*r"),Utf8("mi")), regexp_like(column1@0, f.*r, column2@1) as regexp_like(regexp_test.column1,Utf8("f.*r"),regexp_test.column2)]
02)--DataSourceExec: partitions=1, partition_sizes=[1]

query BB
select
regexp_like(column1, 'f.*r', 'mi'), -- args
regexp_like(column1, 'f.*r', column2) -- non scalar flags
from regexp_test;
----
true true
false false
false false

statement ok
drop table if exists dict_table;
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -784,7 +784,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: regexp_like(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k
01)Projection: test.column1_utf8view ~ Utf8View("^https?://(?:www\.)?([^/]+)/.*$") AS k
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If ~ is faster than regexp_like can we simply change the implementation to use the same underlying implementation of ~ (why only rewrite in some cases?)

Copy link
Contributor Author

@pepijnve pepijnve Oct 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #17838 (comment)

The operator logic is in physical_expr, while regexp_like lives in functions. We would probably have to move the common logic to a separate crate. This PR was intended as a stopgap solution for common cases.

We can only rewrite in some cases because of the optional flags argument. With the operators all you have is the case sensitivity (i.e. the `i` flag).

The reason for the operator being more efficient is that it will make use of the regexp_is_match_scalar kernel if it can, while regexp_like always uses regexp_is_match. regexp_is_match does maintain a cache of compiled regexes so at least the pattern isn't compiled over and over again, but it's still quite a bit more code compared to regexp_is_match_scalar.

Additionally there's a regular expression simplification rule that only operates on BinaryExpr with one of the regex matching operators. The transformation here enables that optimisation for regexp_like calls as well.

02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for REGEXP_MATCH
Expand Down