From f25f0889d9bf8d9c22854420b08f4f370bc89d4b Mon Sep 17 00:00:00 2001 From: Christopher Wolff Date: Tue, 16 May 2023 15:35:57 -0700 Subject: [PATCH 1/2] feat: add pattern for simplifying exprs like `str ~ '^foo$'` --- .../simplify_expressions/expr_simplifier.rs | 15 +++++ .../src/simplify_expressions/regex.rs | 59 ++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 699e92a2085a..e161c522c8ac 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2434,6 +2434,21 @@ mod tests { // single word assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%")); + // regular expressions that match an exact literal + assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit(""))); + assert_change( + regex_not_match(col("c1"), lit("^$")), + col("c1").not_eq(lit("")), + ); + assert_change( + regex_match(col("c1"), lit("^foo$")), + col("c1").eq(lit("foo")), + ); + assert_change( + regex_not_match(col("c1"), lit("^foo$")), + col("c1").not_eq(lit("foo")), + ); + // OR-chain assert_change( regex_match(col("c1"), lit("foo|bar|baz")), diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index a7ae14542df6..33901299270e 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -16,8 +16,8 @@ // under the License. 
use datafusion_common::{DataFusionError, Result, ScalarValue}; -use datafusion_expr::{BinaryExpr, Expr, Like, Operator}; -use regex_syntax::hir::{Hir, HirKind, Literal}; +use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator}; +use regex_syntax::hir::{Hir, HirKind, Literal, Look}; /// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions. const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4; @@ -95,6 +95,15 @@ impl OperatorMode { Expr::Like(like) } } + + fn expr_matches_literal(&self, left: Box<Expr>, right: Box<Expr>) -> Expr { + let op = if self.not { + Operator::NotEq + } else { + Operator::Eq + }; + Expr::BinaryExpr(BinaryExpr { left, op, right }) + } } fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> { @@ -130,6 +139,46 @@ fn is_safe_for_like(c: char) -> bool { (c != '%') && (c != '_') } +/// returns true if the elements in a `Concat` pattern are: +/// - `[Look::Start, Look::End]` +/// - `[Look::Start, Literal(_), Look::End]` +fn is_anchored_literal(v: &[Hir]) -> bool { + match v.len() { + 2..=3 => (), + _ => return false, + }; + + let first_last = ( + v.first().expect("length checked"), + v.last().expect("length checked"), + ); + if !matches!(first_last, + (s, e) if s.kind() == &HirKind::Look(Look::Start) + && e.kind() == &HirKind::Look(Look::End) + ) + { + return false; + } + + v.iter() + .skip(1) + .take(v.len() - 2) + .all(|h| matches!(h.kind(), HirKind::Literal(_))) +} + +/// extracts a string literal expression assuming that `is_anchored_literal()` +/// returned true. 
+fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> { + match v.len() { + 2 => Some(lit("")), + 3 => { + let HirKind::Literal(l) = v[1].kind() else { return None }; + str_from_literal(l).map(lit) + } + _ => None, + } +} + fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> { println!("Considering hir kind: mode {mode:?} hir: {hir:?}"); match hir.kind() { @@ -140,6 +189,12 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> { let s = str_from_literal(l)?; return Some(mode.expr(Box::new(left.clone()), format!("%{s}%"))); } + HirKind::Concat(inner) if is_anchored_literal(inner) => { + let right = anchored_literal_to_expr(inner)?; + return Some( + mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)), + ); + } HirKind::Concat(inner) => { if let Some(pattern) = collect_concat_to_like_string(inner) { return Some(mode.expr(Box::new(left.clone()), pattern)); From b154c294b17644f41421be27967857b378c94a09 Mon Sep 17 00:00:00 2001 From: Christopher Wolff Date: Wed, 17 May 2023 10:04:50 -0700 Subject: [PATCH 2/2] test: add additional tests --- .../simplify_expressions/expr_simplifier.rs | 19 +++++++++++++++++++ .../src/simplify_expressions/regex.rs | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index e161c522c8ac..75f50aa3c576 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2448,6 +2448,12 @@ mod tests { regex_not_match(col("c1"), lit("^foo$")), col("c1").not_eq(lit("foo")), ); + assert_no_change(regex_match(col("c1"), lit("^foo|bar$"))); + assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$"))); + assert_no_change(regex_match(col("c1"), lit("^"))); + assert_no_change(regex_match(col("c1"), lit("$"))); + assert_no_change(regex_match(col("c1"), lit("$^"))); + 
assert_no_change(regex_match(col("c1"), lit("$foo^"))); // OR-chain assert_change( @@ -2468,6 +2474,19 @@ mod tests { .and(not_like(col("c1"), "%bar%")) .and(not_like(col("c1"), "%baz%")), ); + // both anchored expressions (translated to equality) and unanchored + assert_change( + regex_match(col("c1"), lit("foo|^x$|baz")), + like(col("c1"), "%foo%") + .or(col("c1").eq(lit("x"))) + .or(like(col("c1"), "%baz%")), + ); + assert_change( + regex_not_match(col("c1"), lit("foo|^bar$|baz")), + not_like(col("c1"), "%foo%") + .and(col("c1").not_eq(lit("bar"))) + .and(not_like(col("c1"), "%baz%")), + ); // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION) assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc"))); } diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 33901299270e..35f6dcaef09c 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -166,7 +166,7 @@ fn is_anchored_literal(v: &[Hir]) -> bool { .all(|h| matches!(h.kind(), HirKind::Literal(_))) } -/// extracts a string literal expression assuming that `is_anchored_literal()` +/// extracts a string literal expression assuming that [`is_anchored_literal`] /// returned true. fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> { match v.len() {