Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
[SPARK-30763][SQL] Fix java.lang.IndexOutOfBoundsException No group 1…
… for regexp_extract

### What changes were proposed in this pull request?
The current implementation of `regexp_extract` throws an unhandled exception, as shown below:

`SELECT regexp_extract('1a 2b 14m', '\\d+')`
```
java.lang.IndexOutOfBoundsException: No group 1
[info] at java.util.regex.Matcher.group(Matcher.java:538)
[info] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
[info] at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
[info] at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
```
I think this exception should be handled properly, with a clear error message.

### Why are the changes needed?
To fix the bug `java.lang.IndexOutOfBoundsException: No group 1`.

### Does this PR introduce any user-facing change?
Yes

### How was this patch tested?
New UT

Closes #27508 from beliefer/fix-regexp_extract-bug.

Authored-by: beliefer <beliefer@163.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
beliefer authored and cloud-fan committed Feb 12, 2020
1 parent b476999 commit f5026b1
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 1 deletion.
Expand Up @@ -410,6 +410,15 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
}
}

object RegExpExtract {
  /**
   * Validates a requested regex group index before it is passed to
   * `java.util.regex.MatchResult.group(int)`, so that an invalid index surfaces as a
   * descriptive [[IllegalArgumentException]] instead of a raw `IndexOutOfBoundsException`.
   *
   * Valid indexes are `0` (the whole match) through `groupCount` inclusive.
   *
   * @param groupCount number of capturing groups in the pattern, as reported by
   *                   `MatchResult.groupCount`
   * @param groupIndex group index requested by the caller
   * @throws IllegalArgumentException if `groupIndex` is negative or greater than `groupCount`
   */
  def checkGroupIndex(groupCount: Int, groupIndex: Int): Unit = {
    if (groupIndex < 0) {
      // A negative index would otherwise slip through and make MatchResult.group throw
      // IndexOutOfBoundsException, which is exactly the failure this check exists to prevent.
      throw new IllegalArgumentException("The specified group index cannot be less than zero")
    } else if (groupCount < groupIndex) {
      throw new IllegalArgumentException(
        s"Regex group count is $groupCount, but the specified group index is $groupIndex")
    }
  }
}

/**
* Extract a specific(idx) group identified by a Java regex.
*
Expand Down Expand Up @@ -441,7 +450,9 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
val m = pattern.matcher(s.toString)
if (m.find) {
val mr: MatchResult = m.toMatchResult
val group = mr.group(r.asInstanceOf[Int])
val index = r.asInstanceOf[Int]
RegExpExtract.checkGroupIndex(mr.groupCount, index)
val group = mr.group(index)
if (group == null) { // Pattern matched, but not optional group
UTF8String.EMPTY_UTF8
} else {
Expand All @@ -459,6 +470,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val classNamePattern = classOf[Pattern].getCanonicalName
val classNameRegExpExtract = classOf[RegExpExtract].getCanonicalName
val matcher = ctx.freshName("matcher")
val matchResult = ctx.freshName("matchResult")

Expand All @@ -482,6 +494,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
$termPattern.matcher($subject.toString());
if ($matcher.find()) {
java.util.regex.MatchResult $matchResult = $matcher.toMatchResult();
$classNameRegExpExtract.checkGroupIndex($matchResult.groupCount(), $idx);
if ($matchResult.group($idx) == null) {
${ev.value} = UTF8String.EMPTY_UTF8;
} else {
Expand Down
Expand Up @@ -293,6 +293,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1))
checkEvaluation(nonNullExpr, "100", row1)

// invalid group index
val row8 = create_row("100-200", "(\\d+)-(\\d+)", 3)
val row9 = create_row("100-200", "(\\d+).*", 2)
val row10 = create_row("100-200", "\\d+", 1)

checkExceptionInExpression[IllegalArgumentException](
expr, row8, "Regex group count is 2, but the specified group index is 3")
checkExceptionInExpression[IllegalArgumentException](
expr, row9, "Regex group count is 1, but the specified group index is 2")
checkExceptionInExpression[IllegalArgumentException](
expr, row10, "Regex group count is 0, but the specified group index is 1")
}

test("SPLIT") {
Expand Down
@@ -0,0 +1,9 @@
-- regexp_extract
-- Pattern '\\d+' has no capturing groups, so only group index 0 (the whole match) is valid.
-- Omitting the index uses the default of 1; that case and the explicit indexes 1 and 2 are
-- expected to fail with IllegalArgumentException.
SELECT regexp_extract('1a 2b 14m', '\\d+');
SELECT regexp_extract('1a 2b 14m', '\\d+', 0);
SELECT regexp_extract('1a 2b 14m', '\\d+', 1);
SELECT regexp_extract('1a 2b 14m', '\\d+', 2);
-- Pattern '(\\d+)([a-z]+)' has two capturing groups, so indexes 0 through 2 are all valid.
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)');
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0);
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1);
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2);
@@ -0,0 +1,69 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 8


-- !query
SELECT regexp_extract('1a 2b 14m', '\\d+')
-- !query schema
struct<>
-- !query output
java.lang.IllegalArgumentException
Regex group count is 0, but the specified group index is 1


-- !query
SELECT regexp_extract('1a 2b 14m', '\\d+', 0)
-- !query schema
struct<regexp_extract(1a 2b 14m, \d+, 0):string>
-- !query output
1


-- !query
SELECT regexp_extract('1a 2b 14m', '\\d+', 1)
-- !query schema
struct<>
-- !query output
java.lang.IllegalArgumentException
Regex group count is 0, but the specified group index is 1


-- !query
SELECT regexp_extract('1a 2b 14m', '\\d+', 2)
-- !query schema
struct<>
-- !query output
java.lang.IllegalArgumentException
Regex group count is 0, but the specified group index is 2


-- !query
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)')
-- !query schema
struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string>
-- !query output
1


-- !query
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0)
-- !query schema
struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 0):string>
-- !query output
1a


-- !query
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1)
-- !query schema
struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string>
-- !query output
1


-- !query
SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2)
-- !query schema
struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 2):string>
-- !query output
a

0 comments on commit f5026b1

Please sign in to comment.