From 0b93961053439db7ebc175ee456ba9a7694f4811 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 8 Apr 2026 17:02:32 -0400 Subject: [PATCH 1/2] Add more regexp_replace test coverage --- .../test_files/regexp/regexp_replace.slt | 141 +++++++++++++++--- 1 file changed, 124 insertions(+), 17 deletions(-) diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt index a2eccfce5f69..087ad379a2eb 100644 --- a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt +++ b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt @@ -128,29 +128,136 @@ from (values ('a'), ('b')) as tbl(col); NULL NULL NULL NULL NULL NULL +# Fixture for testing optimizations in regexp_replace +statement ok +CREATE TABLE regexp_replace_optimized_cases ( + value string, + regexp string, + replacement string, + expected string +); + +# Extract domain from URL using anchored pattern with trailing .* +# This tests that the full URL suffix is replaced, not just the matched prefix. +statement ok +INSERT INTO regexp_replace_optimized_cases VALUES + ('https://www.example.com/path/to/page?q=1', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'example.com'), + ('http://test.org/foo/bar', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'test.org'), + ('https://example.com/', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'example.com'), + ('not-a-url', '^https?://(?:www\.)?([^/]+)/.*$', '\1', 'not-a-url'); + +# More than one capture group should disable the short-regex fast path. +# This still uses replacement \1, but captures_len() will be > 2, so the +# implementation must fall back to the normal regexp_replace path. +statement ok +INSERT INTO regexp_replace_optimized_cases VALUES + ('https://www.example.com/path/to/page?q=1', '^https?://((www\.)?([^/]+))/.*$', '\1', 'www.example.com'), + ('http://test.org/foo/bar', '^https?://((www\.)?([^/]+))/.*$', '\1', 'test.org'), + ('not-a-url', '^https?://((www\.)?([^/]+))/.*$', '\1', 'not-a-url'); + # If the overall pattern matches but capture group 1 does not participate, # regexp_replace(..., '\1') should substitute the empty string, not keep # the original input. -query B -SELECT regexp_replace('bzzz', '^(a)?b.*$', '\1') = ''; ----- -true +statement ok +INSERT INTO regexp_replace_optimized_cases VALUES + ('bzzz', '^(a)?b.*$', '\1', ''); # Stripping trailing .*$ must not change match semantics for inputs with # newlines when the original pattern does not use the 's' flag. -query B -SELECT regexp_replace(concat('http://x/', chr(10), 'rest'), '^https?://([^/]+)/.*$', '\1') - = concat('http://x/', chr(10), 'rest'); ----- -true +statement ok +INSERT INTO regexp_replace_optimized_cases VALUES + (concat('http://x/', chr(10), 'rest'), '^https?://([^/]+)/.*$', '\1', concat('http://x/', chr(10), 'rest')); # Inline multiline mode still allows only the matched prefix to be replaced. # The remainder of the string must be preserved. -query B -SELECT regexp_replace( - concat('http://x/path', chr(10), 'rest'), - '^(?m)https?://([^/]+)/.*$', - '\1' - ) = concat('x', chr(10), 'rest'); ----- -true +statement ok +INSERT INTO regexp_replace_optimized_cases VALUES + (concat('http://x/path', chr(10), 'rest'), '^(?m)https?://([^/]+)/.*$', '\1', concat('x', chr(10), 'rest')); + +query TB +SELECT value, regexp_replace(value, regexp, replacement) = expected +FROM regexp_replace_optimized_cases +ORDER BY regexp, value, replacement, expected; +---- +http://x/path +rest true +bzzz true +http://test.org/foo/bar true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://test.org/foo/bar true +https://example.com/ true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://x/ +rest true + +query TB +SELECT value, regexp_replace( + arrow_cast(value, 'LargeUtf8'), + arrow_cast(regexp, 'LargeUtf8'), + arrow_cast(replacement, 'LargeUtf8') + ) = arrow_cast(expected, 'LargeUtf8') +FROM regexp_replace_optimized_cases +ORDER BY regexp, value, replacement, expected; +---- +http://x/path +rest true +bzzz true +http://test.org/foo/bar true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://test.org/foo/bar true +https://example.com/ true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://x/ +rest true + +query TB +SELECT value, regexp_replace( + arrow_cast(value, 'Utf8View'), + arrow_cast(regexp, 'Utf8View'), + arrow_cast(replacement, 'Utf8View') + ) = arrow_cast(expected, 'Utf8View') +FROM regexp_replace_optimized_cases +ORDER BY regexp, value, replacement, expected; +---- +http://x/path +rest true +bzzz true +http://test.org/foo/bar true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://test.org/foo/bar true +https://example.com/ true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://x/ +rest true + +query TB +SELECT value, regexp_replace( + arrow_cast(value, 'Dictionary(Int32, Utf8)'), + arrow_cast(regexp, 'Dictionary(Int32, Utf8)'), + arrow_cast(replacement, 'Dictionary(Int32, Utf8)') + ) = arrow_cast(expected, 'Dictionary(Int32, Utf8)') +FROM regexp_replace_optimized_cases +ORDER BY regexp, value, replacement, expected; +---- +http://x/path +rest true +bzzz true +http://test.org/foo/bar true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://test.org/foo/bar true +https://example.com/ true +https://www.example.com/path/to/page?q=1 true +not-a-url true +http://x/ +rest true + +# cleanup +statement ok +DROP TABLE regexp_replace_optimized_cases; From d2b8b01099a162042624424526121dc8db3d5a71 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 9 Apr 2026 07:12:45 -0400 Subject: [PATCH 2/2] Keep multi-line tests in own fixture --- .../test_files/regexp/regexp_replace.slt | 48 ++++++++----------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt index 087ad379a2eb..e3200e334cdf 100644 --- a/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt +++ b/datafusion/sqllogictest/test_files/regexp/regexp_replace.slt @@ -128,6 +128,27 @@ from (values ('a'), ('b')) as tbl(col); NULL NULL NULL NULL NULL NULL + +# Stripping trailing .*$ must not change match semantics for inputs with +# newlines when the original pattern does not use the 's' flag. +query B +SELECT regexp_replace(concat('http://x/', chr(10), 'rest'), '^https?://([^/]+)/.*$', '\1') + = concat('http://x/', chr(10), 'rest'); +---- +true + +# Inline multiline mode still allows only the matched prefix to be replaced. +# The remainder of the string must be preserved. +query B +SELECT regexp_replace( + concat('http://x/path', chr(10), 'rest'), + '^(?m)https?://([^/]+)/.*$', + '\1' + ) = concat('x', chr(10), 'rest'); +---- +true + + # Fixture for testing optimizations in regexp_replace statement ok CREATE TABLE regexp_replace_optimized_cases ( @@ -162,25 +183,12 @@ statement ok INSERT INTO regexp_replace_optimized_cases VALUES ('bzzz', '^(a)?b.*$', '\1', ''); -# Stripping trailing .*$ must not change match semantics for inputs with -# newlines when the original pattern does not use the 's' flag. -statement ok -INSERT INTO regexp_replace_optimized_cases VALUES - (concat('http://x/', chr(10), 'rest'), '^https?://([^/]+)/.*$', '\1', concat('http://x/', chr(10), 'rest')); - -# Inline multiline mode still allows only the matched prefix to be replaced. -# The remainder of the string must be preserved. -statement ok -INSERT INTO regexp_replace_optimized_cases VALUES - (concat('http://x/path', chr(10), 'rest'), '^(?m)https?://([^/]+)/.*$', '\1', concat('x', chr(10), 'rest')); query TB SELECT value, regexp_replace(value, regexp, replacement) = expected FROM regexp_replace_optimized_cases ORDER BY regexp, value, replacement, expected; ---- -http://x/path -rest true bzzz true http://test.org/foo/bar true https://www.example.com/path/to/page?q=1 true @@ -189,8 +197,6 @@ http://test.org/foo/bar true https://example.com/ true https://www.example.com/path/to/page?q=1 true not-a-url true -http://x/ -rest true query TB SELECT value, regexp_replace( @@ -201,8 +207,6 @@ SELECT value, regexp_replace( FROM regexp_replace_optimized_cases ORDER BY regexp, value, replacement, expected; ---- -http://x/path -rest true bzzz true http://test.org/foo/bar true https://www.example.com/path/to/page?q=1 true @@ -211,8 +215,6 @@ http://test.org/foo/bar true https://example.com/ true https://www.example.com/path/to/page?q=1 true not-a-url true -http://x/ -rest true query TB SELECT value, regexp_replace( @@ -223,8 +225,6 @@ SELECT value, regexp_replace( FROM regexp_replace_optimized_cases ORDER BY regexp, value, replacement, expected; ---- -http://x/path -rest true bzzz true http://test.org/foo/bar true https://www.example.com/path/to/page?q=1 true @@ -233,8 +233,6 @@ http://test.org/foo/bar true https://example.com/ true https://www.example.com/path/to/page?q=1 true not-a-url true -http://x/ -rest true query TB SELECT value, regexp_replace( @@ -245,8 +243,6 @@ SELECT value, regexp_replace( FROM regexp_replace_optimized_cases ORDER BY regexp, value, replacement, expected; ---- -http://x/path -rest true bzzz true http://test.org/foo/bar true https://www.example.com/path/to/page?q=1 true @@ -255,8 +251,6 @@ http://test.org/foo/bar true https://example.com/ true https://www.example.com/path/to/page?q=1 true not-a-url true -http://x/ -rest true # cleanup statement ok