Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add `regexp_like`, improve docs and examples for `regexp_match` #9137

Merged
merged 8 commits into from
Feb 9, 2024
250 changes: 204 additions & 46 deletions datafusion-examples/examples/regexp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,9 @@
// specific language governing permissions and limitations
// under the License.

use std::sync::Arc;

use arrow::array::BooleanArray;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::prelude::*;
use datafusion_common::assert_contains;
use datafusion_common::{assert_batches_eq, assert_contains};

/// This example demonstrates how to use the regexp_* functions
///
Expand All @@ -49,29 +43,83 @@ async fn main() -> Result<()> {
//regexp_like examples
//
//
// regexp_like format is (regexp_replace(text, regex[, flags])
// regexp_like format is regexp_like(text, regex[, flags])
//

// use sql and regexp_like function to test col 'values', against patterns in col 'patterns' without flags
let df = ctx
let result = ctx
.sql("select regexp_like(values, patterns) from examples")
.await?
.collect()
.await?;

// print the results
df.show().await?;

// use dataframe and regexp_like function to test col 'values', against patterns in col 'patterns' with flags
let df = ctx
assert_batches_eq!(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

&[
"+------------------------------------------------+",
"| regexp_like(examples.values,examples.patterns) |",
"+------------------------------------------------+",
"| true |",
"| true |",
"| false |",
"| false |",
"| false |",
"| false |",
"| true |",
"| true |",
"| true |",
"| true |",
"| true |",
"+------------------------------------------------+",
],
&result
);

// use sql and regexp_like function to test col 'values', against patterns in col 'patterns' with flags
let result = ctx
.sql("select regexp_like(values, patterns, flags) from examples")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+---------------------------------------------------------------+",
"| regexp_like(examples.values,examples.patterns,examples.flags) |",
"+---------------------------------------------------------------+",
"| true |",
"| true |",
"| true |",
"| false |",
"| false |",
"| false |",
"| true |",
"| true |",
"| true |",
"| true |",
"| true |",
"+---------------------------------------------------------------+",
],
&result
);

// literals work as well
// to match against the entire input use ^ and $ in the regex
let df = ctx.sql("select regexp_like('John Smith', '^.*Smith$'), regexp_like('Smith Jones', '^Smith.*$')").await?;
let result = ctx
.sql("select regexp_like('John Smith', '^.*Smith$'), regexp_like('Smith Jones', '^Smith.*$')")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+---------------------------------------------------+----------------------------------------------------+",
"| regexp_like(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_like(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
"+---------------------------------------------------+----------------------------------------------------+",
"| true | true |",
"+---------------------------------------------------+----------------------------------------------------+",
],
&result
);

// look-around and back references are not supported for performance
// reasons.
Expand All @@ -83,17 +131,20 @@ async fn main() -> Result<()> {
.collect()
.await;

let expected = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)])),
vec![Arc::new(BooleanArray::from(vec![false]))],
)
.unwrap();

assert!(result.is_ok());
let result = result.unwrap();

assert_eq!(result.len(), 1);
assert_eq!(format!("{:?}", result[0]), format!("{expected:?}"));

assert_batches_eq!(
&[
"+-------+",
"| a |",
"+-------+",
"| false |",
"+-------+",
],
&result
);

// invalid flags will result in an error
let result = ctx
Expand All @@ -120,68 +171,175 @@ async fn main() -> Result<()> {
//regexp_match examples
//
//
// regexp_match format is (regexp_replace(text, regex[, flags])
// regexp_match format is regexp_match(text, regex[, flags])
//

let df = ctx.table("examples").await?;

df.show().await?;
let _ = ctx.table("examples").await?;

// use sql and regexp_match function to test col 'values', against patterns in col 'patterns' without flags
let df = ctx
let result = ctx
.sql("select regexp_match(values, patterns) from examples")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+-------------------------------------------------+",
"| regexp_match(examples.values,examples.patterns) |",
"+-------------------------------------------------+",
"| [a] |",
"| [A] |",
"| |",
"| |",
"| |",
"| |",
"| [010] |",
"| [Düsseldorf] |",
"| [Москва] |",
"| [Köln] |",
"| [اليوم] |",
"+-------------------------------------------------+",
],
&result
);

// use sql and regexp_match function to test col 'values', against patterns in col 'patterns' with flags
let df = ctx
let result = ctx
.sql("select regexp_match(values, patterns, flags) from examples")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+----------------------------------------------------------------+",
"| regexp_match(examples.values,examples.patterns,examples.flags) |",
"+----------------------------------------------------------------+",
"| [a] |",
"| [A] |",
"| [B] |",
"| |",
"| |",
"| |",
"| [010] |",
"| [Düsseldorf] |",
"| [Москва] |",
"| [Köln] |",
"| [اليوم] |",
"+----------------------------------------------------------------+",
],
&result
);

// literals work as well
// to match against the entire input use ^ and $ in the regex
let df = ctx.sql("select regexp_match('John Smith', '^.*Smith$'), regexp_match('Smith Jones', '^Smith.*$')").await?;
let result = ctx
.sql("select regexp_match('John Smith', '^.*Smith$'), regexp_match('Smith Jones', '^Smith.*$')")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+----------------------------------------------------+-----------------------------------------------------+",
"| regexp_match(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_match(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
"+----------------------------------------------------+-----------------------------------------------------+",
"| [John Smith] | [Smith Jones] |",
"+----------------------------------------------------+-----------------------------------------------------+",
],
&result
);

//
//
//regexp_replace examples
//
//
// regexp_replace format is (regexp_replace(text, regex, replace, flags)
// regexp_replace format is regexp_replace(text, regex, replace[, flags])
//

// use regexp_replace function against tables
let df = ctx
.sql("SELECT regexp_replace(values, patterns, replacement, flags) FROM examples")
let result = ctx
.sql("SELECT regexp_replace(values, patterns, replacement, concat('g', flags)) FROM examples")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+---------------------------------------------------------------------------------------------------------+",
"| regexp_replace(examples.values,examples.patterns,examples.replacement,concat(Utf8(\"g\"),examples.flags)) |",
"+---------------------------------------------------------------------------------------------------------+",
"| bbabbbc |",
"| B |",
"| aec |",
"| AbC |",
"| aBC |",
"| 4000 |",
"| xyz |",
"| München |",
"| Moscow |",
"| Koln |",
"| Today |",
"+---------------------------------------------------------------------------------------------------------+",
],
&result
);

// global flag example
let df = ctx
let result = ctx
.sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g')")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+------------------------------------------------------------------------+",
"| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\"),Utf8(\"g\")) |",
"+------------------------------------------------------------------------+",
"| fooXarYXazY |",
"+------------------------------------------------------------------------+",
],
&result
);

// without global flag
let df = ctx
.sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', null)")
let result = ctx
.sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y')")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+--------------------------------------------------------------+",
"| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\")) |",
"+--------------------------------------------------------------+",
"| fooXarYbaz |",
"+--------------------------------------------------------------+",
],
&result
);

// null regex means null result
let df = ctx
let result = ctx
.sql("SELECT regexp_replace('foobarbaz', NULL, 'X\\1Y', 'g')")
.await?
.collect()
.await?;

df.show().await?;
assert_batches_eq!(
&[
"+---------------------------------------------------------------+",
"| regexp_replace(Utf8(\"foobarbaz\"),NULL,Utf8(\"X\\1Y\"),Utf8(\"g\")) |",
"+---------------------------------------------------------------+",
"| |",
"+---------------------------------------------------------------+",
],
&result
);

Ok(())
}
Loading