Skip to content

Commit

Permalink
[SPARK-36229][SQL] conv() inconsistently handles invalid strings with…
Browse files Browse the repository at this point in the history
… more than 64 invalid characters and return wrong value on overflow

### What changes were proposed in this pull request?
1/ conv() have inconsistency in behavior where the returned value is different above the 64 char threshold.

```
scala> spark.sql("select conv(repeat('?', 64), 10, 16)").show
+---------------------------+
|conv(repeat(?, 64), 10, 16)|
+---------------------------+
|                          0|
+---------------------------+

scala> spark.sql("select conv(repeat('?', 65), 10, 16)").show // which should be 0
+---------------------------+
|conv(repeat(?, 65), 10, 16)|
+---------------------------+
|           FFFFFFFFFFFFFFFF|
+---------------------------+

scala> spark.sql("select conv(repeat('?', 65), 10, -16)").show // which should be 0
+----------------------------+
|conv(repeat(?, 65), 10, -16)|
+----------------------------+
|                          -1|
+----------------------------+

scala> spark.sql("select conv(repeat('?', 64), 10, -16)").show
+----------------------------+
|conv(repeat(?, 64), 10, -16)|
+----------------------------+
|                           0|
+----------------------------+
```

2/ conv should return result equal to max unsigned long value in base toBase when there is overflow

```
scala> spark.sql(select conv('aaaaaaa0aaaaaaa0a', 16, 10)).show // which should be 18446744073709551615

+-------------------------------+
|conv(aaaaaaa0aaaaaaa0a, 16, 10)|
+-------------------------------+
|           12297828695278266890|
+-------------------------------+
```

### Why are the changes needed?
Bug fix, this pull request aim to make conv function behave similarly with the behavior of conv function from MySQL database
### Does this PR introduce _any_ user-facing change?
change in result of conv() function
### How was this patch tested?
add test

Closes #33459 from dgd-contributor/SPARK-36229_convInconsistencyBehaviorWithMoreThan64Characters.

Authored-by: dgd-contributor <dgd_contributor@viettel.com.vn>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
dchvn authored and cloud-fan committed Jul 28, 2021
1 parent af6d04b commit e1c50ff
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 15 deletions.
Expand Up @@ -49,11 +49,23 @@ object NumberConverter {
*/
private def encode(radix: Int, fromPos: Int, value: Array[Byte]): Long = {
var v: Long = 0L
val bound = java.lang.Long.divideUnsigned(-1 - radix, radix) // Possible overflow once
// bound will always be positive since radix >= 2
// Note that: -1 is equivalent to 11111111...1111 which is the largest unsigned long value
val bound = java.lang.Long.divideUnsigned(-1 - radix, radix)
var i = fromPos
while (i < value.length && value(i) >= 0) {
// if v < 0, which mean its bit presentation starts with 1, so v * radix will cause
// overflow since radix is greater than 2
if (v < 0) {
return -1
}
// check if v greater than bound
// if v is greater than bound, v * radix + radix will cause overflow.
if (v >= bound) {
// Check for overflow
// However our target is checking whether v * radix + value(i) can cause overflow or not.
// Because radix >= 2,so (-1 - value(i)) / radix will be positive (its bit presentation
// will start with 0) and we can easily checking for overflow by checking
// (-1 - value(i)) / radix < v or not
if (java.lang.Long.divideUnsigned(-1 - value(i), radix) < v) {
return -1
}
Expand All @@ -79,8 +91,8 @@ object NumberConverter {
}

/**
* Convert the chars in value[] to the corresponding integers. Convert invalid
* characters to -1.
* Convert the chars in value[] to the corresponding integers. If invalid
* character is found, convert it to -1 and ignore the suffix starting there.
*
* @param radix must be between MIN_RADIX and MAX_RADIX
* @param fromPos is the first nonzero element
Expand All @@ -89,6 +101,10 @@ object NumberConverter {
var i = fromPos
while (i < value.length) {
value(i) = Character.digit(value(i), radix).asInstanceOf[Byte]
// if invalid characters are found, it no need to convert the suffix starting there
if (value(i) == -1) {
return
}
i += 1
}
}
Expand All @@ -112,19 +128,15 @@ object NumberConverter {
var (negative, first) = if (n(0) == '-') (true, 1) else (false, 0)

// Copy the digits in the right side of the array
val temp = new Array[Byte](64)
val temp = new Array[Byte](Math.max(n.length, 64))
var v: Long = -1
if ((n.length == 65 && negative) || n.length <= 64) {
var i = 1
while (i <= n.length - first) {
temp(temp.length - i) = n(n.length - i)
i += 1
}
char2byte(fromBase, temp.length - n.length + first, temp)

// Do the conversion by going through a 64 bit integer
v = encode(fromBase, temp.length - n.length + first, temp)
}
System.arraycopy(n, first, temp, temp.length - n.length + first, n.length - first)
char2byte(fromBase, temp.length - n.length + first, temp)

// Do the conversion by going through a 64 bit integer
v = encode(fromBase, temp.length - n.length + first, temp)

if (negative && toBase > 0) {
if (v < 0) {
v = -1
Expand Down
Expand Up @@ -218,6 +218,20 @@ class MathFunctionsSuite extends QueryTest with SharedSparkSession {
checkAnswer(df.select(conv('num, 16, -10)), Row("-1"))
}

test("SPARK-36229 inconsistently behaviour where returned value is above the 64 char threshold") {
val df = Seq(("?" * 64), ("?" * 65), ("a" * 4 + "?" * 60), ("a" * 4 + "?" * 61)).toDF("num")
val expectedResult = Seq(Row("0"), Row("0"), Row("43690"), Row("43690"))
checkAnswer(df.select(conv('num, 16, 10)), expectedResult)
checkAnswer(df.select(conv('num, 16, -10)), expectedResult)
}

test("SPARK-36229 conv should return result equal to -1 in base of toBase") {
val df = Seq(("aaaaaaa0aaaaaaa0a"), ("aaaaaaa0aaaaaaa0")).toDF("num")
checkAnswer(df.select(conv('num, 16, 10)),
Seq(Row("18446744073709551615"), Row("12297829339523361440")))
checkAnswer(df.select(conv('num, 16, -10)), Seq(Row("-1"), Row("-6148914734186190176")))
}

test("floor") {
testOneToOneMathFunction(floor, (d: Double) => math.floor(d).toLong)
}
Expand Down

0 comments on commit e1c50ff

Please sign in to comment.