From 99c455eadd7a7c0aaccab633e709ec81dbcd05d6 Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Tue, 30 Sep 2025 15:37:35 +0200 Subject: [PATCH 1/4] #17838 Add simplify implementation for regexp_like that rewrites as operator expressions when possible --- datafusion/functions/src/regex/regexplike.rs | 54 ++++++++++++++++++- .../test_files/string/string_view.slt | 2 +- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 0554844d11c1..31252aff64fa 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -27,11 +27,14 @@ use datafusion_common::{ ScalarValue, }; use datafusion_expr::{ - Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, - TypeSignatureClass, Volatility, + binary_expr, cast, Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl, + Signature, TypeSignature, TypeSignatureClass, Volatility, }; use datafusion_macros::user_doc; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion_expr_common::operator::Operator; +use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer; use std::any::Any; use std::sync::Arc; @@ -153,6 +156,53 @@ impl ScalarUDFImpl for RegexpLikeFunc { } } + fn simplify( + &self, + args: Vec<Expr>, + info: &dyn SimplifyInfo, + ) -> Result<ExprSimplifyResult> { + // Try to simplify regexp_like to ~ or ~* if possible since the implementation of those operators + // is more optimised. 
+ let Some((st, op, re)) = (match args.as_slice() { + [string, regexp] => { + Some((string.clone(), Operator::RegexMatch, regexp.clone())) + } + [string, regexp, Expr::Literal(ScalarValue::Utf8(Some(flags)), _)] => { + match flags.as_str() { + "i" => Some((string.clone(), Operator::RegexIMatch, regexp.clone())), + "" => Some((string.clone(), Operator::RegexMatch, regexp.clone())), + _ => None, + } + } + _ => None, + }) else { + return Ok(ExprSimplifyResult::Original(args)); + }; + + let st_type = info.get_data_type(&st)?; + let re_type = info.get_data_type(&re)?; + let binary_type_coercer = BinaryTypeCoercer::new(&st_type, &op, &re_type); + let Ok((coerced_st_type, coerced_re_type)) = + binary_type_coercer.get_input_types() + else { + return Ok(ExprSimplifyResult::Original(args)); + }; + + Ok(ExprSimplifyResult::Simplified(binary_expr( + if st_type != coerced_st_type { + cast(st, coerced_st_type) + } else { + st + }, + op, + if re_type != coerced_re_type { + cast(re, coerced_re_type) + } else { + re + }, + ))) + } + fn documentation(&self) -> Option<&Documentation> { self.doc() } diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 7d10a0615d45..fb67daa0b840 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -784,7 +784,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: regexp_like(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +01)Projection: test.column1_utf8view ~ Utf8View("^https?://(?:www\.)?([^/]+)/.*$") AS k 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for REGEXP_MATCH From 7930bcf82569c55d180cf5be8ae910aa18fd8f02 Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Fri, 3 Oct 2025 18:45:21 +0200 Subject: [PATCH 2/4] Avoid cloning Exprs --- datafusion/functions/src/regex/regexplike.rs | 67 ++++++++++++-------- 1 file changed, 
41 insertions(+), 26 deletions(-) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 31252aff64fa..bdca1b5cc90b 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -158,47 +158,38 @@ impl ScalarUDFImpl for RegexpLikeFunc { fn simplify( &self, - args: Vec<Expr>, + mut args: Vec<Expr>, info: &dyn SimplifyInfo, ) -> Result<ExprSimplifyResult> { - // Try to simplify regexp_like to ~ or ~* if possible since the implementation of those operators - // is more optimised. - let Some((st, op, re)) = (match args.as_slice() { - [string, regexp] => { - Some((string.clone(), Operator::RegexMatch, regexp.clone())) - } - [string, regexp, Expr::Literal(ScalarValue::Utf8(Some(flags)), _)] => { - match flags.as_str() { - "i" => Some((string.clone(), Operator::RegexIMatch, regexp.clone())), - "" => Some((string.clone(), Operator::RegexMatch, regexp.clone())), - _ => None, - } - } - _ => None, - }) else { + // Try to simplify regexp_like to an operator expression since those are more optimised. 
+ let Some(op) = derive_operator(&args) else { return Ok(ExprSimplifyResult::Original(args)); }; - let st_type = info.get_data_type(&st)?; - let re_type = info.get_data_type(&re)?; - let binary_type_coercer = BinaryTypeCoercer::new(&st_type, &op, &re_type); - let Ok((coerced_st_type, coerced_re_type)) = + let string_type = info.get_data_type(&args[0])?; + let regexp_type = info.get_data_type(&args[1])?; + let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, &regexp_type); + let Ok((coerced_string_type, coerced_regexp_type)) = binary_type_coercer.get_input_types() else { return Ok(ExprSimplifyResult::Original(args)); }; + // regexp_like(str, regexp [, flags]) + let regexp = args.swap_remove(1); + let string = args.swap_remove(0); + Ok(ExprSimplifyResult::Simplified(binary_expr( - if st_type != coerced_st_type { - cast(st, coerced_st_type) + if string_type != coerced_string_type { + cast(string, coerced_string_type) } else { - st + string }, op, - if re_type != coerced_re_type { - cast(re, coerced_re_type) + if regexp_type != coerced_regexp_type { + cast(regexp, coerced_regexp_type) } else { - re + regexp }, ))) } @@ -208,6 +199,30 @@ impl ScalarUDFImpl for RegexpLikeFunc { } } +fn derive_operator(args: &[Expr]) -> Option<Operator> { + match args.len() { + // regexp_like(str, regexp, flags) + 3 => { + match &args[2] { + Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => { + match flags.as_str() { + "i" => Some(Operator::RegexIMatch), + "" => Some(Operator::RegexMatch), + // Any flags besides 'i' have no operator equivalent + _ => None, + } + } + // The flags value is not a literal, so we can't derive the operator statically + _ => None, + } + } + // regexp_like(str, regexp) + 2 => Some(Operator::RegexMatch), + // Should never happen, but just in case + _ => None, + } +} + /// Tests a string using a regular expression returning true if at /// least one match, false otherwise. 
/// From a35ba527dbf88ed9c4ea5dbbc8131253402ca286 Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Fri, 3 Oct 2025 22:12:56 +0200 Subject: [PATCH 3/4] Tweak code comments --- datafusion/functions/src/regex/regexplike.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index bdca1b5cc90b..d75eb9141c05 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -161,7 +161,10 @@ impl ScalarUDFImpl for RegexpLikeFunc { mut args: Vec<Expr>, info: &dyn SimplifyInfo, ) -> Result<ExprSimplifyResult> { - // Try to simplify regexp_like to an operator expression since those are more optimised. + // Try to simplify regexp_like usage to one of the builtin operators since those have + // optimized code paths for the case where the regular expression pattern is a scalar. + // Additionally, the expression simplification optimization pass will attempt to further + // simplify regular expression patterns used in operator expressions. 
let Some(op) = derive_operator(&args) else { return Ok(ExprSimplifyResult::Original(args)); }; @@ -212,7 +215,7 @@ fn derive_operator(args: &[Expr]) -> Option<Operator> { _ => None, } } - // The flags value is not a literal, so we can't derive the operator statically + // `flags` is not a literal, so we can't derive the correct operator statically _ => None, } } From 6b8314850cdf3f8e941bb9541ba1272438ea30cd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 6 Oct 2025 08:43:13 -0400 Subject: [PATCH 4/4] Add some more sqllogictests --- .../test_files/regexp/regexp_like.slt | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt index 223ef22b9861..dd42511eade9 100644 --- a/datafusion/sqllogictest/test_files/regexp/regexp_like.slt +++ b/datafusion/sqllogictest/test_files/regexp/regexp_like.slt @@ -277,3 +277,63 @@ drop table strings statement ok drop table dict_table + +# Ensure that regexp_like is rewritten to use the (more optimized) regex operators +statement ok +create table regexp_test as values + ('foobar', 'i'), + ('Foo', 'i'), + ('bar', 'mi') ; + +# Expressions that can be rewritten to use the ~ operator (which is more optimized) +# (expect the plans to use the ~ / ~* operators, not the REGEXP_LIKE function) +query TT +explain select + regexp_like(column1, 'fo.*'), + regexp_like(column1, 'fo.*', 'i'), +from regexp_test; +---- +logical_plan +01)Projection: regexp_test.column1 ~ Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*")), regexp_test.column1 ~* Utf8("fo.*") AS regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i")) +02)--TableScan: regexp_test projection=[column1] +physical_plan +01)ProjectionExec: expr=[column1@0 ~ fo.* as regexp_like(regexp_test.column1,Utf8("fo.*")), column1@0 ~* fo.* as regexp_like(regexp_test.column1,Utf8("fo.*"),Utf8("i"))] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query BB 
+select + regexp_like(column1, 'fo.*'), + regexp_like(column1, 'fo.*', 'i'), +from regexp_test; +---- +true true +false true +false false + +# Expressions that can not be rewritten to use the ~ / ~* operators +# (expect the plans to use the REGEXP_LIKE function) +query TT +explain select + regexp_like(column1, 'f.*r', 'mi'), -- args + regexp_like(column1, 'f.*r', column2) -- non scalar flags +from regexp_test; +---- +logical_plan +01)Projection: regexp_like(regexp_test.column1, Utf8("f.*r"), Utf8("mi")), regexp_like(regexp_test.column1, Utf8("f.*r"), regexp_test.column2) +02)--TableScan: regexp_test projection=[column1, column2] +physical_plan +01)ProjectionExec: expr=[regexp_like(column1@0, f.*r, mi) as regexp_like(regexp_test.column1,Utf8("f.*r"),Utf8("mi")), regexp_like(column1@0, f.*r, column2@1) as regexp_like(regexp_test.column1,Utf8("f.*r"),regexp_test.column2)] +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query BB +select + regexp_like(column1, 'f.*r', 'mi'), -- args + regexp_like(column1, 'f.*r', column2) -- non scalar flags +from regexp_test; +---- +true true +false false +false false + +statement ok +drop table if exists dict_table;