From 6ef1fd3458cf4d03c6c9eeb2eb094cfed39c1389 Mon Sep 17 00:00:00 2001 From: theirix Date: Mon, 2 Sep 2024 22:58:11 +0100 Subject: [PATCH 1/4] Handle SIMILAR TO for physical plan --- .../physical-expr/src/expressions/binary.rs | 16 ++++++++++++++++ .../physical-expr/src/expressions/mod.rs | 2 +- datafusion/physical-expr/src/planner.rs | 18 +++++++++++++++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 08c133d7193a..aa8f1f067f8f 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -681,6 +681,22 @@ pub fn binary( Ok(Arc::new(BinaryExpr::new(lhs, op, rhs))) } +/// Create a similar to expression +pub fn similar_to( + negated: bool, + case_insensitive: bool, + expr: Arc<dyn PhysicalExpr>, + pattern: Arc<dyn PhysicalExpr>, +) -> Result<Arc<dyn PhysicalExpr>> { + let binary_op = match (negated, case_insensitive) { + (false, false) => Operator::RegexMatch, + (false, true) => Operator::RegexIMatch, + (true, false) => Operator::RegexNotMatch, + (true, true) => Operator::RegexNotIMatch, + }; + Ok(Arc::new(BinaryExpr::new(expr, binary_op, pattern))) +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 87d8f04a6858..177fd799ae79 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -42,7 +42,7 @@ pub use crate::window::ntile::Ntile; pub use crate::window::rank::{dense_rank, percent_rank, rank, Rank, RankType}; pub use crate::PhysicalSortExpr; -pub use binary::{binary, BinaryExpr}; +pub use binary::{binary, similar_to, BinaryExpr}; pub use case::{case, CaseExpr}; pub use cast::{cast, CastExpr}; pub use column::{col, with_new_schema, Column}; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index d015f545bf9d..bffc2c46fc1e 
100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use crate::scalar_function; use crate::{ - expressions::{self, binary, like, Column, Literal}, + expressions::{self, binary, like, similar_to, Column, Literal}, PhysicalExpr, }; @@ -215,6 +215,22 @@ pub fn create_physical_expr( input_schema, ) } + Expr::SimilarTo(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { + if escape_char.is_some() { + return exec_err!("SIMILAR TO does not support escape_char yet"); + } + let physical_expr = + create_physical_expr(expr, input_dfschema, execution_props)?; + let physical_pattern = + create_physical_expr(pattern, input_dfschema, execution_props)?; + similar_to(*negated, *case_insensitive, physical_expr, physical_pattern) + } Expr::Case(case) => { let expr: Option<Arc<dyn PhysicalExpr>> = if let Some(e) = &case.expr { Some(create_physical_expr( From 9279beec8b355dd2be34051abbebb67e8cfe1213 Mon Sep 17 00:00:00 2001 From: theirix Date: Mon, 2 Sep 2024 23:01:13 +0100 Subject: [PATCH 2/4] Add sqllogictest test cases --- .../sqllogictest/test_files/strings.slt | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/datafusion/sqllogictest/test_files/strings.slt b/datafusion/sqllogictest/test_files/strings.slt index 30fb2d750d95..81b8f4b2da9a 100644 --- a/datafusion/sqllogictest/test_files/strings.slt +++ b/datafusion/sqllogictest/test_files/strings.slt @@ -46,6 +46,51 @@ P1m1e1 p1m1e1 p2m1e1 +# REGEX +query T rowsort +SELECT s FROM test WHERE s ~ 'p[12].*'; +---- +p1 +p1e1 +p1m1e1 +p2 +p2e1 +p2m1e1 + +# REGEX nocase +query T rowsort +SELECT s FROM test WHERE s ~* 'p[12].*'; +---- +P1 +P1e1 +P1m1e1 +p1 +p1e1 +p1m1e1 +p2 +p2e1 +p2m1e1 + +# SIMILAR TO +query T rowsort +SELECT s FROM test WHERE s SIMILAR TO 'p[12].*'; +---- +p1 +p1e1 +p1m1e1 +p2 +p2e1 +p2m1e1 + +# NOT SIMILAR TO +query T rowsort +SELECT s FROM test WHERE s NOT SIMILAR TO 'p[12].*'; +---- +P1 +P1e1 +P1m1e1 +e1 + 
# NOT LIKE query T rowsort SELECT s FROM test WHERE s NOT LIKE 'p1%'; From 5d6b1b9ebe5864ee5def731393eca131a3231b48 Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 5 Sep 2024 19:44:32 +0100 Subject: [PATCH 3/4] Add unit tests for similar to --- .../physical-expr/src/expressions/binary.rs | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index aa8f1f067f8f..890bdcb8c817 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -4242,4 +4242,62 @@ mod tests { .contains("Overflow happened on: 2147483647 * 2")); Ok(()) } + + /// Test helper for SIMILAR TO binary operation + fn apply_similar_to( + schema: &SchemaRef, + va: Vec<&str>, + vb: Vec<&str>, + negated: bool, + case_insensitive: bool, + expected: &BooleanArray, + ) -> Result<()> { + let a = StringArray::from(va); + let b = StringArray::from(vb); + let op = similar_to( + negated, + case_insensitive, + col("a", schema)?, + col("b", schema)?, + ); + let batch = + RecordBatch::try_new(Arc::clone(schema), vec![Arc::new(a), Arc::new(b)])?; + let result = op + .evaluate(&batch)? 
+ .into_array(batch.num_rows()) + .expect("Failed to convert to array"); + assert_eq!(result.as_ref(), expected); + + Ok(()) + } + + #[test] + fn test_similar_to() { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + + let expected = [true, false].iter().collect(); + // case-sensitive + apply_similar_to( + &schema, + vec!["hello world", "Hello World"], + vec!["hello.*", "hello.*"], + false, + false, + &expected, + ) + .unwrap(); + // case-insensitive + apply_similar_to( + &schema, + vec!["hello world", "bye"], + vec!["hello.*", "hello.*"], + false, + true, + &expected, + ) + .unwrap(); + } } From 6307662dca6909c51a514ed4b978c30fb72fc45b Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 5 Sep 2024 21:11:55 +0100 Subject: [PATCH 4/4] Fix type error --- datafusion/physical-expr/src/expressions/binary.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 890bdcb8c817..0bc9c2c23b57 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -4259,7 +4259,7 @@ mod tests { case_insensitive, col("a", schema)?, col("b", schema)?, - ); + )?; let batch = RecordBatch::try_new(Arc::clone(schema), vec![Arc::new(a), Arc::new(b)])?; let result = op @@ -4278,7 +4278,7 @@ mod tests { Field::new("b", DataType::Utf8, false), ])); - let expected = [true, false].iter().collect(); + let expected = [Some(true), Some(false)].iter().collect(); // case-sensitive apply_similar_to( &schema,