Skip to content

Commit

Permalink
[SPARK-32365][SQL] Add a boundary condition for negative index in reg…
Browse files Browse the repository at this point in the history
…exp_extract

### What changes were proposed in this pull request?
The current implementation of regexp_extract throws an unhandled exception when given a negative group index, as shown below:
SELECT regexp_extract('1a 2b 14m', '\\d+', -1)

```
java.lang.IndexOutOfBoundsException: No group -1
java.util.regex.Matcher.group(Matcher.java:538)
org.apache.spark.sql.catalyst.expressions.RegExpExtract.nullSafeEval(regexpExpressions.scala:455)
org.apache.spark.sql.catalyst.expressions.TernaryExpression.eval(Expression.scala:704)
org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:52)
org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$1$$anonfun$applyOrElse$1.applyOrElse(expressions.scala:45)
```

### Why are the changes needed?
Fixes a bug where a negative group index leaks an uncaught `java.lang.IndexOutOfBoundsException: No group -1` from `java.util.regex.Matcher.group`.

### Does this PR introduce _any_ user-facing change?
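Yes. A negative group index now fails with `java.lang.IllegalArgumentException: The specified group index cannot be less than zero` instead of `java.lang.IndexOutOfBoundsException: No group -1`.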

### How was this patch tested?
Added new unit tests in RegexpExpressionsSuite and new SQL query tests covering negative group indexes.

Closes #29161 from beliefer/regexp_extract-group-not-allow-less-than-zero.

Authored-by: gengjiaan <gengjiaan@360.cn>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
beliefer authored and dongjoon-hyun committed Jul 21, 2020
1 parent 7d65cae commit 02114f9
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,9 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio

object RegExpExtract {
def checkGroupIndex(groupCount: Int, groupIndex: Int): Unit = {
if (groupCount < groupIndex) {
if (groupIndex < 0) {
throw new IllegalArgumentException("The specified group index cannot be less than zero")
} else if (groupCount < groupIndex) {
throw new IllegalArgumentException(
s"Regex group count is $groupCount, but the specified group index is $groupIndex")
}
Expand All @@ -426,6 +428,14 @@ object RegExpExtract {
*/
@ExpressionDescription(
usage = "_FUNC_(str, regexp[, idx]) - Extracts a group that matches `regexp`.",
arguments = """
Arguments:
* str - a string expression.
* regexp - a string representing a regular expression.
The regex string should be a Java regular expression.
* idx - an integer expression that representing the group index. The group index should be
non-negative. If `idx` is not specified, the default group index value is 1.
""",
examples = """
Examples:
> SELECT _FUNC_('100-200', '(\\d+)-(\\d+)', 1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,13 +303,19 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
val row8 = create_row("100-200", "(\\d+)-(\\d+)", 3)
val row9 = create_row("100-200", "(\\d+).*", 2)
val row10 = create_row("100-200", "\\d+", 1)
val row11 = create_row("100-200", "(\\d+)-(\\d+)", -1)
val row12 = create_row("100-200", "\\d+", -1)

checkExceptionInExpression[IllegalArgumentException](
expr, row8, "Regex group count is 2, but the specified group index is 3")
checkExceptionInExpression[IllegalArgumentException](
expr, row9, "Regex group count is 1, but the specified group index is 2")
checkExceptionInExpression[IllegalArgumentException](
expr, row10, "Regex group count is 0, but the specified group index is 1")
checkExceptionInExpression[IllegalArgumentException](
expr, row11, "The specified group index cannot be less than zero")
checkExceptionInExpression[IllegalArgumentException](
expr, row12, "The specified group index cannot be less than zero")

// Test escaping of arguments
GenerateUnsafeProjection.generate(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ SELECT regexp_extract('1a 2b 14m', '\\d+');
SELECT regexp_extract('1a 2b 14m', '\\d+', 0);
SELECT regexp_extract('1a 2b 14m', '\\d+', 1);
SELECT regexp_extract('1a 2b 14m', '\\d+', 2);
SELECT regexp_extract('1a 2b 14m', '\\d+', -1);
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)');
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0);
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1);
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2);
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', -1);
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 8
-- Number of queries: 10


-- !query
Expand Down Expand Up @@ -37,6 +37,15 @@ java.lang.IllegalArgumentException
Regex group count is 0, but the specified group index is 2


-- !query
SELECT regexp_extract('1a 2b 14m', '\\d+', -1)
-- !query schema
struct<>
-- !query output
java.lang.IllegalArgumentException
The specified group index cannot be less than zero


-- !query
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)')
-- !query schema
Expand Down Expand Up @@ -67,3 +76,12 @@ SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2)
struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 2):string>
-- !query output
a


-- !query
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', -1)
-- !query schema
struct<>
-- !query output
java.lang.IllegalArgumentException
The specified group index cannot be less than zero

0 comments on commit 02114f9

Please sign in to comment.