[SPARK-31755][SQL] allow missing year/hour when parsing date/timestamp string

What changes were proposed in this pull request?

This PR allows missing hour fields when parsing date/timestamp strings, with 0 as the default value.

If the year field is missing, this PR still fails the query by default, but provides a new legacy config to allow it, using 1970 as the default value. 1970 is not an ideal default, as it is not a leap year, which means Feb 29 can never be parsed; we pick it only for backward compatibility.

Why are the changes needed?

To keep backward compatibility with Spark 2.4.

Does this PR introduce any user-facing change?

Yes.

Spark 2.4:
```
scala> sql("select to_timestamp('16', 'dd')").show
+------------------------+
|to_timestamp('16', 'dd')|
+------------------------+
|     1970-01-16 00:00:00|
+------------------------+

scala> sql("select to_date('16', 'dd')").show
+-------------------+
|to_date('16', 'dd')|
+-------------------+
|         1970-01-16|
+-------------------+

scala> sql("select to_timestamp('2019 40', 'yyyy mm')").show
+----------------------------------+
|to_timestamp('2019 40', 'yyyy mm')|
+----------------------------------+
|               2019-01-01 00:40:00|
+----------------------------------+

scala> sql("select to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')").show
+----------------------------------------------+
|to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')|
+----------------------------------------------+
|                           2019-01-01 10:10:10|
+----------------------------------------------+
```

In branch 3.0:
```
scala> sql("select to_timestamp('16', 'dd')").show
+--------------------+
|to_timestamp(16, dd)|
+--------------------+
|                null|
+--------------------+

scala> sql("select to_date('16', 'dd')").show
+---------------+
|to_date(16, dd)|
+---------------+
|           null|
+---------------+

scala> sql("select to_timestamp('2019 40', 'yyyy mm')").show
+------------------------------+
|to_timestamp(2019 40, yyyy mm)|
+------------------------------+
|           2019-01-01 00:00:00|
+------------------------------+

scala> sql("select to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')").show
+------------------------------------------+
|to_timestamp(2019 10:10:10, yyyy hh:mm:ss)|
+------------------------------------------+
|                       2019-01-01 00:00:00|
+------------------------------------------+
```

After this PR, the behavior becomes the same as in 2.4 when the legacy config is enabled.
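As a hedged illustration only (the exact key of the new legacy config is not spelled out in this excerpt; `spark.sql.legacy.timeParserPolicy` is the existing policy named in the SparkUpgradeException message in this diff), restoring the 2.4-style parsing in a session might look like:
```
// Hypothetical usage sketch, not taken from this commit.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
spark.sql("select to_timestamp('16', 'dd')").show()  // expected to print 1970-01-16 00:00:00, as in 2.4
```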

How was this patch tested?

New tests.

Closes #28576 from cloud-fan/bug.

Authored-by: Wenchen Fan <wenchen@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
cloud-fan committed May 22, 2020
1 parent ec80e4b commit 763c351
Showing 12 changed files with 370 additions and 92 deletions.
@@ -53,7 +53,7 @@ class Iso8601DateFormatter(
val specialDate = convertSpecialDate(s.trim, zoneId)
specialDate.getOrElse {
try {
val localDate = LocalDate.parse(s, formatter)
val localDate = toLocalDate(formatter.parse(s))
localDateToDays(localDate)
} catch checkDiffResult(s, legacyFormatter.parse)
}
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util

import java.time._
import java.time.chrono.IsoChronology
import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, DateTimeParseException, ResolverStyle}
import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle}
import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries}
import java.util.Locale

@@ -31,17 +31,52 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._

trait DateTimeFormatterHelper {
private def getOrDefault(accessor: TemporalAccessor, field: ChronoField, default: Int): Int = {
if (accessor.isSupported(field)) {
accessor.get(field)
} else {
default
}
}

protected def toLocalDate(accessor: TemporalAccessor): LocalDate = {
val localDate = accessor.query(TemporalQueries.localDate())
// If all the date fields are specified, return the local date directly.
if (localDate != null) return localDate

// Users may want to parse only a few datetime fields from a string and extract these fields
// later, and we should provide default values for missing fields.
// To be compatible with Spark 2.4, we pick 1970 as the default value of year.
val year = getOrDefault(accessor, ChronoField.YEAR, 1970)
val month = getOrDefault(accessor, ChronoField.MONTH_OF_YEAR, 1)
val day = getOrDefault(accessor, ChronoField.DAY_OF_MONTH, 1)
LocalDate.of(year, month, day)
}

private def toLocalTime(accessor: TemporalAccessor): LocalTime = {
val localTime = accessor.query(TemporalQueries.localTime())
// If all the time fields are specified, return the local time directly.
if (localTime != null) return localTime

val hour = if (accessor.isSupported(ChronoField.HOUR_OF_DAY)) {
accessor.get(ChronoField.HOUR_OF_DAY)
} else if (accessor.isSupported(ChronoField.HOUR_OF_AMPM)) {
// When we reach here, it means am/pm is not specified. Here we assume it's am.
accessor.get(ChronoField.HOUR_OF_AMPM)
} else {
0
}
val minute = getOrDefault(accessor, ChronoField.MINUTE_OF_HOUR, 0)
val second = getOrDefault(accessor, ChronoField.SECOND_OF_MINUTE, 0)
val nanoSecond = getOrDefault(accessor, ChronoField.NANO_OF_SECOND, 0)
LocalTime.of(hour, minute, second, nanoSecond)
}

// Converts the parsed temporal object to ZonedDateTime. It sets time components to zeros
// if they does not exist in the parsed object.
protected def toZonedDateTime(
temporalAccessor: TemporalAccessor,
zoneId: ZoneId): ZonedDateTime = {
// Parsed input might not have time related part. In that case, time component is set to zeros.
val parsedLocalTime = temporalAccessor.query(TemporalQueries.localTime)
val localTime = if (parsedLocalTime == null) LocalTime.MIDNIGHT else parsedLocalTime
// Parsed input must have date component. At least, year must present in temporalAccessor.
val localDate = temporalAccessor.query(TemporalQueries.localDate)

protected def toZonedDateTime(accessor: TemporalAccessor, zoneId: ZoneId): ZonedDateTime = {
val localDate = toLocalDate(accessor)
val localTime = toLocalTime(accessor)
ZonedDateTime.of(localDate, localTime, zoneId)
}
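For intuition, here is a minimal, self-contained java.time sketch of the default-filling idea in `toLocalDate` (an illustration only, not the Spark implementation above):
```
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.time.temporal.{ChronoField, TemporalQueries}

// Parse a string that carries only an hour field, e.g. with pattern "HH".
val parsed = DateTimeFormatter.ofPattern("HH").parse("20")
// No complete date can be queried from the parsed result.
assert(parsed.query(TemporalQueries.localDate()) == null)
// Fill the missing date fields the way toLocalDate does: 1970-01-01 by default.
def orDefault(field: ChronoField, default: Int): Int =
  if (parsed.isSupported(field)) parsed.get(field) else default
val localDate = LocalDate.of(
  orDefault(ChronoField.YEAR, 1970),
  orDefault(ChronoField.MONTH_OF_YEAR, 1),
  orDefault(ChronoField.DAY_OF_MONTH, 1))
assert(localDate == LocalDate.of(1970, 1, 1))
```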

@@ -72,19 +107,15 @@ trait DateTimeFormatterHelper
// DateTimeParseException will address by the caller side.
protected def checkDiffResult[T](
s: String, legacyParseFunc: String => T): PartialFunction[Throwable, T] = {
case e: DateTimeParseException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION =>
val res = try {
Some(legacyParseFunc(s))
case e: DateTimeException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION =>
try {
legacyParseFunc(s)
} catch {
case _: Throwable => None
}
if (res.nonEmpty) {
throw new SparkUpgradeException("3.0", s"Fail to parse '$s' in the new parser. You can " +
s"set ${SQLConf.LEGACY_TIME_PARSER_POLICY.key} to LEGACY to restore the behavior " +
s"before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.", e)
} else {
throw e
case _: Throwable => throw e
}
throw new SparkUpgradeException("3.0", s"Fail to parse '$s' in the new parser. You can " +
s"set ${SQLConf.LEGACY_TIME_PARSER_POLICY.key} to LEGACY to restore the behavior " +
s"before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.", e)
}
}

@@ -101,10 +132,6 @@ private object DateTimeFormatterHelper {

def toFormatter(builder: DateTimeFormatterBuilder, locale: Locale): DateTimeFormatter = {
builder
.parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
.parseDefaulting(ChronoField.DAY_OF_MONTH, 1)
.parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0)
.parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0)
.toFormatter(locale)
.withChronology(IsoChronology.INSTANCE)
.withResolverStyle(ResolverStyle.STRICT)
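For context, a hedged sketch of what the removed `parseDefaulting` calls did: the formatter itself injected defaults for missing fields at parse time. With the accessor-side defaults in `toLocalDate`/`toLocalTime` above, the builder no longer needs them.
```
import java.time.LocalDate
import java.time.format.DateTimeFormatterBuilder
import java.time.temporal.ChronoField

// Illustration only: a year-only pattern plus builder-level defaults resolves to Jan 1.
val withDefaults = new DateTimeFormatterBuilder()
  .appendPattern("uuuu")  // proleptic year, e.g. "2019"
  .parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
  .parseDefaulting(ChronoField.DAY_OF_MONTH, 1)
  .toFormatter()
println(LocalDate.parse("2019", withDefaults))  // 2019-01-01
```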
@@ -325,30 +325,30 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45") ==
date(2020, 1, 12, 12, 3, 45, 0))
assert(parser.makeConverter("t", DateType).apply("2020-1-12") ==
days(2020, 1, 12, 0, 0, 0))
days(2020, 1, 12))
// The legacy format allows arbitrary length of second fraction.
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45.1") ==
date(2020, 1, 12, 12, 3, 45, 100000))
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45.1234") ==
date(2020, 1, 12, 12, 3, 45, 123400))
// The legacy format allow date string to end with T or space, with arbitrary string
assert(parser.makeConverter("t", DateType).apply("2020-1-12T") ==
days(2020, 1, 12, 0, 0, 0))
days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("2020-1-12Txyz") ==
days(2020, 1, 12, 0, 0, 0))
days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("2020-1-12 ") ==
days(2020, 1, 12, 0, 0, 0))
days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("2020-1-12 xyz") ==
days(2020, 1, 12, 0, 0, 0))
days(2020, 1, 12))
// The legacy format ignores the "GMT" from the string
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45GMT") ==
date(2020, 1, 12, 12, 3, 45, 0))
assert(parser.makeConverter("t", TimestampType).apply("GMT2020-1-12 12:3:45") ==
date(2020, 1, 12, 12, 3, 45, 0))
assert(parser.makeConverter("t", DateType).apply("2020-1-12GMT") ==
days(2020, 1, 12, 0, 0, 0))
days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("GMT2020-1-12") ==
days(2020, 1, 12, 0, 0, 0))
days(2020, 1, 12))
}

val options = new CSVOptions(Map.empty[String, String], false, "UTC")
@@ -88,12 +88,8 @@ object DateTimeTestUtils {
def days(
year: Int,
month: Byte = 1,
day: Byte = 1,
hour: Byte = 0,
minute: Byte = 0,
sec: Byte = 0): Int = {
val micros = date(year, month, day, hour, minute, sec)
TimeUnit.MICROSECONDS.toDays(micros).toInt
day: Byte = 1): Int = {
LocalDate.of(year, month, day).toEpochDay.toInt
}

// Returns microseconds since epoch for current date and give time
@@ -386,13 +386,13 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
}

test("date add months") {
val input = days(1997, 2, 28, 10, 30)
val input = days(1997, 2, 28)
assert(dateAddMonths(input, 36) === days(2000, 2, 28))
assert(dateAddMonths(input, -13) === days(1996, 1, 28))
}

test("date add interval with day precision") {
val input = days(1997, 2, 28, 10, 30)
val input = days(1997, 2, 28)
assert(dateAddInterval(input, new CalendarInterval(36, 0, 0)) === days(2000, 2, 28))
assert(dateAddInterval(input, new CalendarInterval(36, 47, 0)) === days(2000, 4, 15))
assert(dateAddInterval(input, new CalendarInterval(-13, 0, 0)) === days(1996, 1, 28))
@@ -17,18 +17,19 @@

package org.apache.spark.sql.util

import java.time.{DateTimeException, LocalDate, ZoneOffset}
import java.time.{DateTimeException, LocalDate}

import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyDateFormats}
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy

class DateFormatterSuite extends SparkFunSuite with SQLHelper {
test("parsing dates") {
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(getZoneId(timeZone))
val daysSinceEpoch = formatter.parse("2018-12-02")
@@ -38,7 +39,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}

test("format dates") {
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(getZoneId(timeZone))
val (days, expected) = (17867, "2018-12-02")
@@ -65,7 +66,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
"2018-12-12",
"2038-01-01",
"5010-11-17").foreach { date =>
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
@@ -99,7 +100,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
17877,
24837,
1110657).foreach { days =>
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
@@ -118,14 +119,14 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}

test("parsing date without explicit day") {
val formatter = DateFormatter("yyyy MMM", ZoneOffset.UTC)
val formatter = DateFormatter("yyyy MMM", UTC)
val daysSinceEpoch = formatter.parse("2018 Dec")
assert(daysSinceEpoch === LocalDate.of(2018, 12, 1).toEpochDay)
assert(daysSinceEpoch === days(2018, 12, 1))
}

test("formatting negative years with default pattern") {
val epochDays = LocalDate.of(-99, 1, 1).toEpochDay.toInt
assert(DateFormatter(ZoneOffset.UTC).format(epochDays) === "-0099-01-01")
val epochDays = days(-99, 1, 1)
assert(DateFormatter(UTC).format(epochDays) === "-0099-01-01")
}

test("special date values") {
@@ -142,8 +143,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}

test("SPARK-30958: parse date with negative year") {
val formatter1 = DateFormatter("yyyy-MM-dd", ZoneOffset.UTC)
assert(formatter1.parse("-1234-02-22") === localDateToDays(LocalDate.of(-1234, 2, 22)))
val formatter1 = DateFormatter("yyyy-MM-dd", UTC)
assert(formatter1.parse("-1234-02-22") === days(-1234, 2, 22))

def assertParsingError(f: => Unit): Unit = {
intercept[Exception](f) match {
@@ -155,18 +156,18 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}

// "yyyy" with "G" can't parse negative year or year 0000.
val formatter2 = DateFormatter("G yyyy-MM-dd", ZoneOffset.UTC)
val formatter2 = DateFormatter("G yyyy-MM-dd", UTC)
assertParsingError(formatter2.parse("BC -1234-02-22"))
assertParsingError(formatter2.parse("AD 0000-02-22"))

assert(formatter2.parse("BC 1234-02-22") === localDateToDays(LocalDate.of(-1233, 2, 22)))
assert(formatter2.parse("AD 1234-02-22") === localDateToDays(LocalDate.of(1234, 2, 22)))
assert(formatter2.parse("BC 1234-02-22") === days(-1233, 2, 22))
assert(formatter2.parse("AD 1234-02-22") === days(1234, 2, 22))
}

test("SPARK-31557: rebasing in legacy formatters/parsers") {
withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> LegacyBehaviorPolicy.LEGACY.toString) {
LegacyDateFormats.values.foreach { legacyFormat =>
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
Expand All @@ -182,4 +183,17 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}
}
}

test("missing date fields") {
val formatter = DateFormatter("HH", UTC)
val daysSinceEpoch = formatter.parse("20")
assert(daysSinceEpoch === days(1970, 1, 1))
}

test("missing year field with invalid date") {
val formatter = DateFormatter("MM-dd", UTC)
// The date parser in 2.4 accepts 1970-02-29 and turn it into 1970-03-01, so we should get a
// SparkUpgradeException here.
intercept[SparkUpgradeException](formatter.parse("02-29"))
}
}
