Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-29822][SQL] Fix cast error when there are white spaces between signs and values #26449

Closed
wants to merge 14 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -425,14 +425,18 @@ object IntervalUtils {
}

/**
 * States of the byte-by-byte state machine that parses an interval string.
 *
 * Parsing starts in `PREFIX`. After the last input byte has been consumed, only the
 * states reached once a unit name has been read (`UNIT_SUFFIX`, `UNIT_END`) or while
 * waiting for the next sign (`TRIM_BEFORE_SIGN`) produce a `CalendarInterval`;
 * ending in any other state yields `null`.
 *
 * NOTE(review): this listing shows the pre-change state names (`BEGIN_VALUE` ..
 * `END_UNIT_NAME`) followed by their replacements (`TRIM_BEFORE_SIGN` .. `UNIT_END`);
 * only one of the two `= Value` lists can remain in compilable code.
 */
private object ParseState extends Enumeration {
// Alias so that helper signatures can name a state simply as `ParseState`.
type ParseState = Value

// Each `TRIM_BEFORE_*` state skips spaces before handing over to the state named after it.
val PREFIX,
BEGIN_VALUE,
PARSE_SIGN,
PARSE_UNIT_VALUE,
FRACTIONAL_PART,
BEGIN_UNIT_NAME,
UNIT_NAME_SUFFIX,
END_UNIT_NAME = Value
TRIM_BEFORE_SIGN,
SIGN,
TRIM_BEFORE_VALUE,
VALUE,
VALUE_FRACTIONAL_PART,
TRIM_BEFORE_UNIT,
UNIT_BEGIN,
UNIT_SUFFIX,
UNIT_END = Value
}
// UTF-8 form of the literal "interval " (note the trailing space). The parser advances
// its byte index by this string's length in the PREFIX state.
// NOTE(review): the prefix match itself is not visible here; presumably the keyword is optional.
private final val intervalStr = UTF8String.fromString("interval ")
private def unitToUtf8(unit: IntervalUnit): UTF8String = {
Expand All @@ -458,7 +462,7 @@ object IntervalUtils {
val s = input.trim.toLowerCase
// scalastyle:on
val bytes = s.getBytes
if (bytes.length == 0) {
if (bytes.isEmpty) {
return null
}
var state = PREFIX
Expand All @@ -471,6 +475,13 @@ object IntervalUtils {
var fractionScale: Int = 0
var fraction: Int = 0

/**
 * Skips one space or hands over to the next parser state.
 *
 * If `b` is a space, the byte is consumed by advancing the index `i` and the state is
 * left unchanged. For any other byte the state becomes `next` and the index is NOT
 * advanced, so the same byte is examined again by the new state.
 *
 * @param b    the byte at the current position
 * @param next the state to enter once a non-space byte is seen
 */
def trimToNextState(b: Byte, next: ParseState): Unit = {
  if (b == ' ') {
    // Still inside the run of spaces: consume it and stay in the trimming state.
    i += 1
  } else {
    // First non-space byte: leave it in place for the next state to handle.
    state = next
  }
}

while (i < bytes.length) {
val b = bytes(i)
state match {
Expand All @@ -482,13 +493,9 @@ object IntervalUtils {
i += intervalStr.numBytes()
}
}
state = BEGIN_VALUE
case BEGIN_VALUE =>
b match {
case ' ' => i += 1
case _ => state = PARSE_SIGN
}
case PARSE_SIGN =>
state = TRIM_BEFORE_SIGN
case TRIM_BEFORE_SIGN => trimToNextState(b, SIGN)
case SIGN =>
b match {
case '-' =>
isNegative = true
Expand All @@ -505,111 +512,108 @@ object IntervalUtils {
// Sets the scale to an invalid value to track fraction presence
// in the UNIT_BEGIN state
fractionScale = -1
state = PARSE_UNIT_VALUE
case PARSE_UNIT_VALUE =>
state = TRIM_BEFORE_VALUE
case TRIM_BEFORE_VALUE => trimToNextState(b, VALUE)
case VALUE =>
b match {
case _ if '0' <= b && b <= '9' =>
try {
currentValue = Math.addExact(Math.multiplyExact(10, currentValue), (b - '0'))
} catch {
case _: ArithmeticException => return null
}
case ' ' =>
state = BEGIN_UNIT_NAME
case ' ' => state = TRIM_BEFORE_UNIT
case '.' =>
fractionScale = (NANOS_PER_SECOND / 10).toInt
state = FRACTIONAL_PART
state = VALUE_FRACTIONAL_PART
case _ => return null
}
i += 1
case FRACTIONAL_PART =>
case VALUE_FRACTIONAL_PART =>
b match {
case _ if '0' <= b && b <= '9' && fractionScale > 0 =>
fraction += (b - '0') * fractionScale
fractionScale /= 10
case ' ' =>
fraction /= NANOS_PER_MICROS.toInt
state = BEGIN_UNIT_NAME
state = TRIM_BEFORE_UNIT
case _ => return null
}
i += 1
case BEGIN_UNIT_NAME =>
if (b == ' ') {
i += 1
} else {
// Checks that only seconds can have the fractional part
if (b != 's' && fractionScale >= 0) {
return null
}
if (isNegative) {
currentValue = -currentValue
fraction = -fraction
}
try {
b match {
case 'y' if s.matchAt(yearStr, i) =>
val monthsInYears = Math.multiplyExact(MONTHS_PER_YEAR, currentValue)
months = Math.toIntExact(Math.addExact(months, monthsInYears))
i += yearStr.numBytes()
case 'w' if s.matchAt(weekStr, i) =>
val daysInWeeks = Math.multiplyExact(DAYS_PER_WEEK, currentValue)
days = Math.toIntExact(Math.addExact(days, daysInWeeks))
i += weekStr.numBytes()
case 'd' if s.matchAt(dayStr, i) =>
days = Math.addExact(days, Math.toIntExact(currentValue))
i += dayStr.numBytes()
case 'h' if s.matchAt(hourStr, i) =>
val hoursUs = Math.multiplyExact(currentValue, MICROS_PER_HOUR)
microseconds = Math.addExact(microseconds, hoursUs)
i += hourStr.numBytes()
case 's' if s.matchAt(secondStr, i) =>
val secondsUs = Math.multiplyExact(currentValue, MICROS_PER_SECOND)
microseconds = Math.addExact(Math.addExact(microseconds, secondsUs), fraction)
i += secondStr.numBytes()
case 'm' =>
if (s.matchAt(monthStr, i)) {
months = Math.addExact(months, Math.toIntExact(currentValue))
i += monthStr.numBytes()
} else if (s.matchAt(minuteStr, i)) {
val minutesUs = Math.multiplyExact(currentValue, MICROS_PER_MINUTE)
microseconds = Math.addExact(microseconds, minutesUs)
i += minuteStr.numBytes()
} else if (s.matchAt(millisStr, i)) {
val millisUs = Math.multiplyExact(
currentValue,
MICROS_PER_MILLIS)
microseconds = Math.addExact(microseconds, millisUs)
i += millisStr.numBytes()
} else if (s.matchAt(microsStr, i)) {
microseconds = Math.addExact(microseconds, currentValue)
i += microsStr.numBytes()
} else return null
case _ => return null
}
} catch {
case _: ArithmeticException => return null
case TRIM_BEFORE_UNIT => trimToNextState(b, UNIT_BEGIN)
case UNIT_BEGIN =>
// Checks that only seconds can have the fractional part
if (b != 's' && fractionScale >= 0) {
return null
}
if (isNegative) {
currentValue = -currentValue
fraction = -fraction
}
try {
b match {
case 'y' if s.matchAt(yearStr, i) =>
val monthsInYears = Math.multiplyExact(MONTHS_PER_YEAR, currentValue)
months = Math.toIntExact(Math.addExact(months, monthsInYears))
i += yearStr.numBytes()
case 'w' if s.matchAt(weekStr, i) =>
val daysInWeeks = Math.multiplyExact(DAYS_PER_WEEK, currentValue)
days = Math.toIntExact(Math.addExact(days, daysInWeeks))
i += weekStr.numBytes()
case 'd' if s.matchAt(dayStr, i) =>
days = Math.addExact(days, Math.toIntExact(currentValue))
i += dayStr.numBytes()
case 'h' if s.matchAt(hourStr, i) =>
val hoursUs = Math.multiplyExact(currentValue, MICROS_PER_HOUR)
microseconds = Math.addExact(microseconds, hoursUs)
i += hourStr.numBytes()
case 's' if s.matchAt(secondStr, i) =>
val secondsUs = Math.multiplyExact(currentValue, MICROS_PER_SECOND)
microseconds = Math.addExact(Math.addExact(microseconds, secondsUs), fraction)
i += secondStr.numBytes()
case 'm' =>
if (s.matchAt(monthStr, i)) {
months = Math.addExact(months, Math.toIntExact(currentValue))
i += monthStr.numBytes()
} else if (s.matchAt(minuteStr, i)) {
val minutesUs = Math.multiplyExact(currentValue, MICROS_PER_MINUTE)
microseconds = Math.addExact(microseconds, minutesUs)
i += minuteStr.numBytes()
} else if (s.matchAt(millisStr, i)) {
val millisUs = Math.multiplyExact(
currentValue,
MICROS_PER_MILLIS)
microseconds = Math.addExact(microseconds, millisUs)
i += millisStr.numBytes()
} else if (s.matchAt(microsStr, i)) {
microseconds = Math.addExact(microseconds, currentValue)
i += microsStr.numBytes()
} else return null
case _ => return null
}
state = UNIT_NAME_SUFFIX
} catch {
case _: ArithmeticException => return null
}
case UNIT_NAME_SUFFIX =>
state = UNIT_SUFFIX
case UNIT_SUFFIX =>
b match {
case 's' => state = END_UNIT_NAME
case ' ' => state = BEGIN_VALUE
case 's' => state = UNIT_END
case ' ' => state = TRIM_BEFORE_SIGN
case _ => return null
}
i += 1
case END_UNIT_NAME =>
case UNIT_END =>
b match {
case ' ' =>
i += 1
state = BEGIN_VALUE
state = TRIM_BEFORE_SIGN
case _ => return null
}
}
}

val result = state match {
case UNIT_NAME_SUFFIX | END_UNIT_NAME | BEGIN_VALUE =>
case UNIT_SUFFIX | UNIT_END | TRIM_BEFORE_SIGN =>
new CalendarInterval(months, days, microseconds)
case _ => null
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class IntervalUtilsSuite extends SparkFunSuite {
"-1 MONTH 1 day -1 microseconds" -> new CalendarInterval(-1, 1, -1),
" 123 MONTHS 123 DAYS 123 Microsecond " -> new CalendarInterval(123, 123, 123),
"interval -1 day +3 Microseconds" -> new CalendarInterval(0, -1, 3),
"interval - 1 day + 3 Microseconds" -> new CalendarInterval(0, -1, 3),
" interval 8 years -11 months 123 weeks -1 day " +
"23 hours -22 minutes 1 second -123 millisecond 567 microseconds " ->
new CalendarInterval(85, 860, 81480877567L)).foreach { case (input, expected) =>
Expand Down
48 changes: 26 additions & 22 deletions sql/core/benchmarks/IntervalBenchmark-jdk11-results.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1,29 @@
OpenJDK 64-Bit Server VM 11.0.2+9 on Mac OS X 10.15.1
Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.14.6
Intel(R) Core(TM) i5-5287U CPU @ 2.90GHz
cast strings to intervals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
prepare string w/ interval 442 472 41 2.3 442.4 1.0X
prepare string w/o interval 420 423 6 2.4 419.6 1.1X
1 units w/ interval 350 359 9 2.9 349.8 1.3X
1 units w/o interval 316 317 1 3.2 316.4 1.4X
2 units w/ interval 457 459 2 2.2 457.0 1.0X
2 units w/o interval 432 435 3 2.3 432.2 1.0X
3 units w/ interval 610 613 3 1.6 609.8 0.7X
3 units w/o interval 581 583 2 1.7 580.5 0.8X
4 units w/ interval 720 724 4 1.4 720.4 0.6X
4 units w/o interval 699 704 8 1.4 699.4 0.6X
5 units w/ interval 850 850 0 1.2 849.9 0.5X
5 units w/o interval 829 832 5 1.2 828.7 0.5X
6 units w/ interval 927 932 4 1.1 927.1 0.5X
6 units w/o interval 891 892 1 1.1 890.5 0.5X
7 units w/ interval 1033 1040 8 1.0 1033.2 0.4X
7 units w/o interval 1020 1024 5 1.0 1020.2 0.4X
8 units w/ interval 1168 1169 2 0.9 1168.0 0.4X
8 units w/o interval 1155 1157 2 0.9 1154.5 0.4X
9 units w/ interval 1326 1328 3 0.8 1326.1 0.3X
9 units w/o interval 1372 1381 14 0.7 1372.5 0.3X
prepare string w/ interval 574 610 45 1.7 573.9 1.0X
prepare string w/o interval 518 538 27 1.9 517.7 1.1X
1 units w/ interval 425 439 16 2.4 425.3 1.3X
1 units w/o interval 385 393 10 2.6 385.2 1.5X
2 units w/ interval 553 561 11 1.8 553.1 1.0X
2 units w/o interval 531 543 11 1.9 531.0 1.1X
3 units w/ interval 1134 1159 32 0.9 1134.0 0.5X
3 units w/o interval 1121 1126 6 0.9 1121.3 0.5X
4 units w/ interval 1226 1250 21 0.8 1226.1 0.5X
4 units w/o interval 1227 1239 11 0.8 1227.1 0.5X
5 units w/ interval 1375 1447 93 0.7 1374.7 0.4X
5 units w/o interval 1335 1346 19 0.7 1335.1 0.4X
6 units w/ interval 1530 1556 24 0.7 1529.5 0.4X
6 units w/o interval 1481 1492 17 0.7 1480.7 0.4X
7 units w/ interval 1730 1745 14 0.6 1729.9 0.3X
7 units w/o interval 1788 1859 112 0.6 1788.1 0.3X
8 units w/ interval 1952 2087 117 0.5 1951.7 0.3X
8 units w/o interval 2083 2207 209 0.5 2082.5 0.3X
9 units w/ interval 2228 2291 60 0.4 2227.5 0.3X
9 units w/o interval 2130 2184 75 0.5 2130.1 0.3X
10 units w/ interval 2414 2502 81 0.4 2413.8 0.2X
10 units w/o interval 2463 2488 35 0.4 2463.1 0.2X
11 units w/ interval 2717 2755 42 0.4 2716.8 0.2X
11 units w/o interval 2578 2661 77 0.4 2577.7 0.2X

Loading