From 763c35145e50da553094135e27c7092fac5019e3 Mon Sep 17 00:00:00 2001
From: Wenchen Fan
Date: Fri, 22 May 2020 16:10:08 +0900
Subject: [PATCH] [SPARK-31755][SQL] allow missing year/hour when parsing date/timestamp string

This PR allows missing hour fields when parsing date/timestamp strings, with 0 as the default value. If the year field is missing, this PR still fails the query by default, but provides a new legacy config to allow it and use 1970 as the default value. 1970 is not an ideal default, as it is not a leap year, which means a date such as Feb 29 can never be parsed with the default year; we pick it only for backward compatibility.

The change is needed to keep backward compatibility with Spark 2.4, and it is a user-facing change.

Spark 2.4:
```
scala> sql("select to_timestamp('16', 'dd')").show
+------------------------+
|to_timestamp('16', 'dd')|
+------------------------+
|     1970-01-16 00:00:00|
+------------------------+

scala> sql("select to_date('16', 'dd')").show
+-------------------+
|to_date('16', 'dd')|
+-------------------+
|         1970-01-16|
+-------------------+

scala> sql("select to_timestamp('2019 40', 'yyyy mm')").show
+----------------------------------+
|to_timestamp('2019 40', 'yyyy mm')|
+----------------------------------+
|               2019-01-01 00:40:00|
+----------------------------------+

scala> sql("select to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')").show
+----------------------------------------------+
|to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')|
+----------------------------------------------+
|                           2019-01-01 10:10:10|
+----------------------------------------------+
```

In branch-3.0:
```
scala> sql("select to_timestamp('16', 'dd')").show
+--------------------+
|to_timestamp(16, dd)|
+--------------------+
|                null|
+--------------------+

scala> sql("select to_date('16', 'dd')").show
+---------------+
|to_date(16, dd)|
+---------------+
|           null|
+---------------+

scala> sql("select to_timestamp('2019 40', 'yyyy mm')").show
+------------------------------+
|to_timestamp(2019 40, yyyy mm)|
+------------------------------+
|           2019-01-01 00:00:00|
+------------------------------+

scala> sql("select to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')").show
+------------------------------------------+
|to_timestamp(2019 10:10:10, yyyy hh:mm:ss)|
+------------------------------------------+
|                       2019-01-01 00:00:00|
+------------------------------------------+
```

After this PR, the behavior becomes the same as 2.4 when the legacy config is enabled; an illustrative post-PR session is shown below. The change is covered by new tests.

Closes #28576 from cloud-fan/bug.
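For reference, the result values below come from the `datetime.sql.out` golden files updated by this patch; the spark-shell transcript itself is an approximation (column headers and alignment are illustrative, not captured output):
```
scala> sql("select to_timestamp('16', 'dd')").show
+--------------------+
|to_timestamp(16, dd)|
+--------------------+
| 1970-01-16 00:00:00|
+--------------------+

scala> sql("select to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')").show
+------------------------------------------+
|to_timestamp(2019 10:10:10, yyyy hh:mm:ss)|
+------------------------------------------+
|                       2019-01-01 10:10:10|
+------------------------------------------+
```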
Authored-by: Wenchen Fan Signed-off-by: HyukjinKwon --- .../sql/catalyst/util/DateFormatter.scala | 2 +- .../util/DateTimeFormatterHelper.scala | 77 ++++++---- .../catalyst/csv/UnivocityParserSuite.scala | 14 +- .../sql/catalyst/util/DateTimeTestUtils.scala | 8 +- .../catalyst/util/DateTimeUtilsSuite.scala | 4 +- .../spark/sql/util/DateFormatterSuite.scala | 46 ++++-- .../sql/util/TimestampFormatterSuite.scala | 139 ++++++++++++++---- .../resources/sql-tests/inputs/datetime.sql | 12 +- .../sql-tests/inputs/json-functions.sql | 15 ++ .../sql-tests/results/ansi/datetime.sql.out | 50 ++++++- .../sql-tests/results/datetime.sql.out | 50 ++++++- .../sql-tests/results/json-functions.sql.out | 45 +++++- 12 files changed, 370 insertions(+), 92 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala index 7d94955096196..8261f57916fa2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala @@ -53,7 +53,7 @@ class Iso8601DateFormatter( val specialDate = convertSpecialDate(s.trim, zoneId) specialDate.getOrElse { try { - val localDate = LocalDate.parse(s, formatter) + val localDate = toLocalDate(formatter.parse(s)) localDateToDays(localDate) } catch checkDiffResult(s, legacyFormatter.parse) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala index 05ec23f7ad479..35f95dbffca6e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util import java.time._ import java.time.chrono.IsoChronology -import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, DateTimeParseException, ResolverStyle} +import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle} import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries} import java.util.Locale @@ -31,17 +31,52 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._ trait DateTimeFormatterHelper { + private def getOrDefault(accessor: TemporalAccessor, field: ChronoField, default: Int): Int = { + if (accessor.isSupported(field)) { + accessor.get(field) + } else { + default + } + } + + protected def toLocalDate(accessor: TemporalAccessor): LocalDate = { + val localDate = accessor.query(TemporalQueries.localDate()) + // If all the date fields are specified, return the local date directly. + if (localDate != null) return localDate + + // Users may want to parse only a few datetime fields from a string and extract these fields + // later, and we should provide default values for missing fields. + // To be compatible with Spark 2.4, we pick 1970 as the default value of year. 
+ val year = getOrDefault(accessor, ChronoField.YEAR, 1970) + val month = getOrDefault(accessor, ChronoField.MONTH_OF_YEAR, 1) + val day = getOrDefault(accessor, ChronoField.DAY_OF_MONTH, 1) + LocalDate.of(year, month, day) + } + + private def toLocalTime(accessor: TemporalAccessor): LocalTime = { + val localTime = accessor.query(TemporalQueries.localTime()) + // If all the time fields are specified, return the local time directly. + if (localTime != null) return localTime + + val hour = if (accessor.isSupported(ChronoField.HOUR_OF_DAY)) { + accessor.get(ChronoField.HOUR_OF_DAY) + } else if (accessor.isSupported(ChronoField.HOUR_OF_AMPM)) { + // When we reach here, it means am/pm is not specified. Here we assume it's am. + accessor.get(ChronoField.HOUR_OF_AMPM) + } else { + 0 + } + val minute = getOrDefault(accessor, ChronoField.MINUTE_OF_HOUR, 0) + val second = getOrDefault(accessor, ChronoField.SECOND_OF_MINUTE, 0) + val nanoSecond = getOrDefault(accessor, ChronoField.NANO_OF_SECOND, 0) + LocalTime.of(hour, minute, second, nanoSecond) + } + // Converts the parsed temporal object to ZonedDateTime. It sets time components to zeros // if they does not exist in the parsed object. - protected def toZonedDateTime( - temporalAccessor: TemporalAccessor, - zoneId: ZoneId): ZonedDateTime = { - // Parsed input might not have time related part. In that case, time component is set to zeros. - val parsedLocalTime = temporalAccessor.query(TemporalQueries.localTime) - val localTime = if (parsedLocalTime == null) LocalTime.MIDNIGHT else parsedLocalTime - // Parsed input must have date component. At least, year must present in temporalAccessor. - val localDate = temporalAccessor.query(TemporalQueries.localDate) - + protected def toZonedDateTime(accessor: TemporalAccessor, zoneId: ZoneId): ZonedDateTime = { + val localDate = toLocalDate(accessor) + val localTime = toLocalTime(accessor) ZonedDateTime.of(localDate, localTime, zoneId) } @@ -72,19 +107,15 @@ trait DateTimeFormatterHelper { // DateTimeParseException will address by the caller side. protected def checkDiffResult[T]( s: String, legacyParseFunc: String => T): PartialFunction[Throwable, T] = { - case e: DateTimeParseException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION => - val res = try { - Some(legacyParseFunc(s)) + case e: DateTimeException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION => + try { + legacyParseFunc(s) } catch { - case _: Throwable => None - } - if (res.nonEmpty) { - throw new SparkUpgradeException("3.0", s"Fail to parse '$s' in the new parser. You can " + - s"set ${SQLConf.LEGACY_TIME_PARSER_POLICY.key} to LEGACY to restore the behavior " + - s"before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.", e) - } else { - throw e + case _: Throwable => throw e } + throw new SparkUpgradeException("3.0", s"Fail to parse '$s' in the new parser. 
You can " + + s"set ${SQLConf.LEGACY_TIME_PARSER_POLICY.key} to LEGACY to restore the behavior " + + s"before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.", e) } } @@ -101,10 +132,6 @@ private object DateTimeFormatterHelper { def toFormatter(builder: DateTimeFormatterBuilder, locale: Locale): DateTimeFormatter = { builder - .parseDefaulting(ChronoField.MONTH_OF_YEAR, 1) - .parseDefaulting(ChronoField.DAY_OF_MONTH, 1) - .parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0) - .parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0) .toFormatter(locale) .withChronology(IsoChronology.INSTANCE) .withResolverStyle(ResolverStyle.STRICT) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index 4853b4f162be0..474bb53c24564 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -325,7 +325,7 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45") == date(2020, 1, 12, 12, 3, 45, 0)) assert(parser.makeConverter("t", DateType).apply("2020-1-12") == - days(2020, 1, 12, 0, 0, 0)) + days(2020, 1, 12)) // The legacy format allows arbitrary length of second fraction. assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45.1") == date(2020, 1, 12, 12, 3, 45, 100000)) @@ -333,22 +333,22 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { date(2020, 1, 12, 12, 3, 45, 123400)) // The legacy format allow date string to end with T or space, with arbitrary string assert(parser.makeConverter("t", DateType).apply("2020-1-12T") == - days(2020, 1, 12, 0, 0, 0)) + days(2020, 1, 12)) assert(parser.makeConverter("t", DateType).apply("2020-1-12Txyz") == - days(2020, 1, 12, 0, 0, 0)) + days(2020, 1, 12)) assert(parser.makeConverter("t", DateType).apply("2020-1-12 ") == - days(2020, 1, 12, 0, 0, 0)) + days(2020, 1, 12)) assert(parser.makeConverter("t", DateType).apply("2020-1-12 xyz") == - days(2020, 1, 12, 0, 0, 0)) + days(2020, 1, 12)) // The legacy format ignores the "GMT" from the string assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45GMT") == date(2020, 1, 12, 12, 3, 45, 0)) assert(parser.makeConverter("t", TimestampType).apply("GMT2020-1-12 12:3:45") == date(2020, 1, 12, 12, 3, 45, 0)) assert(parser.makeConverter("t", DateType).apply("2020-1-12GMT") == - days(2020, 1, 12, 0, 0, 0)) + days(2020, 1, 12)) assert(parser.makeConverter("t", DateType).apply("GMT2020-1-12") == - days(2020, 1, 12, 0, 0, 0)) + days(2020, 1, 12)) } val options = new CSVOptions(Map.empty[String, String], false, "UTC") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala index bf9e8f71ba1c9..66aef1b4b6cb0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala @@ -88,12 +88,8 @@ object DateTimeTestUtils { def days( year: Int, month: Byte = 1, - day: Byte = 1, - hour: Byte = 0, - minute: Byte = 0, - sec: Byte = 0): Int = { - val micros = date(year, month, day, hour, minute, sec) - TimeUnit.MICROSECONDS.toDays(micros).toInt + day: Byte = 
1): Int = { + LocalDate.of(year, month, day).toEpochDay.toInt } // Returns microseconds since epoch for current date and give time diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index b547c445f06fe..d526ae18ab65e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -386,13 +386,13 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { } test("date add months") { - val input = days(1997, 2, 28, 10, 30) + val input = days(1997, 2, 28) assert(dateAddMonths(input, 36) === days(2000, 2, 28)) assert(dateAddMonths(input, -13) === days(1996, 1, 28)) } test("date add interval with day precision") { - val input = days(1997, 2, 28, 10, 30) + val input = days(1997, 2, 28) assert(dateAddInterval(input, new CalendarInterval(36, 0, 0)) === days(2000, 2, 28)) assert(dateAddInterval(input, new CalendarInterval(36, 47, 0)) === days(2000, 4, 15)) assert(dateAddInterval(input, new CalendarInterval(-13, 0, 0)) === days(1996, 1, 28)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala index 3954b9b8355c8..7d503cc091172 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala @@ -17,18 +17,19 @@ package org.apache.spark.sql.util -import java.time.{DateTimeException, LocalDate, ZoneOffset} +import java.time.{DateTimeException, LocalDate} import org.apache.spark.{SparkFunSuite, SparkUpgradeException} import org.apache.spark.sql.catalyst.plans.SQLHelper -import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyDateFormats} +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy class DateFormatterSuite extends SparkFunSuite with SQLHelper { test("parsing dates") { - DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone => + outstandingTimezonesIds.foreach { timeZone => withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) { val formatter = DateFormatter(getZoneId(timeZone)) val daysSinceEpoch = formatter.parse("2018-12-02") @@ -38,7 +39,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { } test("format dates") { - DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone => + outstandingTimezonesIds.foreach { timeZone => withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) { val formatter = DateFormatter(getZoneId(timeZone)) val (days, expected) = (17867, "2018-12-02") @@ -65,7 +66,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { "2018-12-12", "2038-01-01", "5010-11-17").foreach { date => - DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone => + outstandingTimezonesIds.foreach { timeZone => withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) { val formatter = DateFormatter( DateFormatter.defaultPattern, @@ -99,7 +100,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { 17877, 24837, 1110657).foreach { days => - 
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone => + outstandingTimezonesIds.foreach { timeZone => withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) { val formatter = DateFormatter( DateFormatter.defaultPattern, @@ -118,14 +119,14 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { } test("parsing date without explicit day") { - val formatter = DateFormatter("yyyy MMM", ZoneOffset.UTC) + val formatter = DateFormatter("yyyy MMM", UTC) val daysSinceEpoch = formatter.parse("2018 Dec") - assert(daysSinceEpoch === LocalDate.of(2018, 12, 1).toEpochDay) + assert(daysSinceEpoch === days(2018, 12, 1)) } test("formatting negative years with default pattern") { - val epochDays = LocalDate.of(-99, 1, 1).toEpochDay.toInt - assert(DateFormatter(ZoneOffset.UTC).format(epochDays) === "-0099-01-01") + val epochDays = days(-99, 1, 1) + assert(DateFormatter(UTC).format(epochDays) === "-0099-01-01") } test("special date values") { @@ -142,8 +143,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { } test("SPARK-30958: parse date with negative year") { - val formatter1 = DateFormatter("yyyy-MM-dd", ZoneOffset.UTC) - assert(formatter1.parse("-1234-02-22") === localDateToDays(LocalDate.of(-1234, 2, 22))) + val formatter1 = DateFormatter("yyyy-MM-dd", UTC) + assert(formatter1.parse("-1234-02-22") === days(-1234, 2, 22)) def assertParsingError(f: => Unit): Unit = { intercept[Exception](f) match { @@ -155,18 +156,18 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { } // "yyyy" with "G" can't parse negative year or year 0000. - val formatter2 = DateFormatter("G yyyy-MM-dd", ZoneOffset.UTC) + val formatter2 = DateFormatter("G yyyy-MM-dd", UTC) assertParsingError(formatter2.parse("BC -1234-02-22")) assertParsingError(formatter2.parse("AD 0000-02-22")) - assert(formatter2.parse("BC 1234-02-22") === localDateToDays(LocalDate.of(-1233, 2, 22))) - assert(formatter2.parse("AD 1234-02-22") === localDateToDays(LocalDate.of(1234, 2, 22))) + assert(formatter2.parse("BC 1234-02-22") === days(-1233, 2, 22)) + assert(formatter2.parse("AD 1234-02-22") === days(1234, 2, 22)) } test("SPARK-31557: rebasing in legacy formatters/parsers") { withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> LegacyBehaviorPolicy.LEGACY.toString) { LegacyDateFormats.values.foreach { legacyFormat => - DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone => + outstandingTimezonesIds.foreach { timeZone => withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) { val formatter = DateFormatter( DateFormatter.defaultPattern, @@ -182,4 +183,17 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { } } } + + test("missing date fields") { + val formatter = DateFormatter("HH", UTC) + val daysSinceEpoch = formatter.parse("20") + assert(daysSinceEpoch === days(1970, 1, 1)) + } + + test("missing year field with invalid date") { + val formatter = DateFormatter("MM-dd", UTC) + // The date parser in 2.4 accepts 1970-02-29 and turn it into 1970-03-01, so we should get a + // SparkUpgradeException here. 
+ intercept[SparkUpgradeException](formatter.parse("02-29")) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala index b467e24b5301a..dccb3defe3728 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala @@ -17,15 +17,15 @@ package org.apache.spark.sql.util -import java.time.{DateTimeException, Instant, LocalDateTime, LocalTime, ZoneOffset} +import java.time.{DateTimeException, Instant, LocalDateTime, LocalTime} import java.util.concurrent.TimeUnit import org.scalatest.Matchers import org.apache.spark.{SparkFunSuite, SparkUpgradeException} import org.apache.spark.sql.catalyst.plans.SQLHelper -import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils, LegacyDateFormats, TimestampFormatter} -import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{CET, PST, UTC} +import org.apache.spark.sql.catalyst.util.{LegacyDateFormats, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy @@ -44,10 +44,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers "Antarctica/Vostok" -> 1543723872001234L, "Asia/Hong_Kong" -> 1543716672001234L, "Europe/Amsterdam" -> 1543741872001234L) - DateTimeTestUtils.outstandingTimezonesIds.foreach { zoneId => + outstandingTimezonesIds.foreach { zoneId => val formatter = TimestampFormatter( "yyyy-MM-dd'T'HH:mm:ss.SSSSSS", - DateTimeUtils.getZoneId(zoneId), + getZoneId(zoneId), needVarLengthSecondFraction = true) val microsSinceEpoch = formatter.parse(localDate) assert(microsSinceEpoch === expectedMicros(zoneId)) @@ -65,7 +65,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers "Antarctica/Vostok" -> "2018-12-02 16:11:12.001234", "Asia/Hong_Kong" -> "2018-12-02 18:11:12.001234", "Europe/Amsterdam" -> "2018-12-02 11:11:12.001234") - DateTimeTestUtils.outstandingTimezonesIds.foreach { zoneId => + outstandingTimezonesIds.foreach { zoneId => Seq( TimestampFormatter( "yyyy-MM-dd HH:mm:ss.SSSSSS", @@ -95,7 +95,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers 1543749753123456L, 2177456523456789L, 11858049903010203L).foreach { micros => - DateTimeTestUtils.outstandingZoneIds.foreach { zoneId => + outstandingZoneIds.foreach { zoneId => val timestamp = TimestampFormatter(pattern, zoneId).format(micros) val parsed = TimestampFormatter( pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp) @@ -116,7 +116,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers "2018-12-02T11:22:33.123456", "2039-01-01T01:02:03.456789", "2345-10-07T22:45:03.010203").foreach { timestamp => - DateTimeTestUtils.outstandingZoneIds.foreach { zoneId => + outstandingZoneIds.foreach { zoneId => val pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSSSS" val micros = TimestampFormatter( pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp) @@ -127,10 +127,9 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers } test("case insensitive parsing of am and pm") { - val formatter = TimestampFormatter("yyyy MMM dd hh:mm:ss a", ZoneOffset.UTC) + val formatter = 
TimestampFormatter("yyyy MMM dd hh:mm:ss a", UTC) val micros = formatter.parse("2009 Mar 20 11:30:01 am") - assert(micros === TimeUnit.SECONDS.toMicros( - LocalDateTime.of(2009, 3, 20, 11, 30, 1).toEpochSecond(ZoneOffset.UTC))) + assert(micros === date(2009, 3, 20, 11, 30, 1)) } test("format fraction of second") { @@ -143,7 +142,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers 1000000 -> "1970-01-01 00:00:01").foreach { case (micros, tsStr) => assert(formatter.format(micros) === tsStr) assert(formatter.format(microsToInstant(micros)) === tsStr) - DateTimeTestUtils.withDefaultTimeZone(UTC) { + withDefaultTimeZone(UTC) { assert(formatter.format(toJavaTimestamp(micros)) === tsStr) } } @@ -151,10 +150,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers test("formatting negative years with default pattern") { val instant = LocalDateTime.of(-99, 1, 1, 0, 0, 0).atZone(UTC).toInstant - val micros = DateTimeUtils.instantToMicros(instant) + val micros = instantToMicros(instant) assert(TimestampFormatter(UTC).format(micros) === "-0099-01-01 00:00:00") assert(TimestampFormatter(UTC).format(instant) === "-0099-01-01 00:00:00") - DateTimeTestUtils.withDefaultTimeZone(UTC) { // toJavaTimestamp depends on the default time zone + withDefaultTimeZone(UTC) { // toJavaTimestamp depends on the default time zone assert(TimestampFormatter("yyyy-MM-dd HH:mm:SS G", UTC).format(toJavaTimestamp(micros)) === "0100-01-01 00:00:00 BC") } @@ -181,11 +180,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers } test("parsing timestamp strings with various seconds fractions") { - DateTimeTestUtils.outstandingZoneIds.foreach { zoneId => + outstandingZoneIds.foreach { zoneId => def check(pattern: String, input: String, reference: String): Unit = { val formatter = TimestampFormatter(pattern, zoneId, needVarLengthSecondFraction = true) - val expected = DateTimeUtils.stringToTimestamp( - UTF8String.fromString(reference), zoneId).get + val expected = stringToTimestamp(UTF8String.fromString(reference), zoneId).get val actual = formatter.parse(input) assert(actual === expected) } @@ -219,11 +217,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers } test("formatting timestamp strings up to microsecond precision") { - DateTimeTestUtils.outstandingZoneIds.foreach { zoneId => + outstandingZoneIds.foreach { zoneId => def check(pattern: String, input: String, expected: String): Unit = { val formatter = TimestampFormatter(pattern, zoneId) - val timestamp = DateTimeUtils.stringToTimestamp( - UTF8String.fromString(input), zoneId).get + val timestamp = stringToTimestamp(UTF8String.fromString(input), zoneId).get val actual = formatter.format(timestamp) assert(actual === expected) } @@ -259,9 +256,8 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers } test("SPARK-30958: parse timestamp with negative year") { - val formatter1 = TimestampFormatter("yyyy-MM-dd HH:mm:ss", ZoneOffset.UTC, true) - assert(formatter1.parse("-1234-02-22 02:22:22") === instantToMicros( - LocalDateTime.of(-1234, 2, 22, 2, 22, 22).toInstant(ZoneOffset.UTC))) + val formatter1 = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC, true) + assert(formatter1.parse("-1234-02-22 02:22:22") === date(-1234, 2, 22, 2, 22, 22)) def assertParsingError(f: => Unit): Unit = { intercept[Exception](f) match { @@ -277,17 +273,15 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers 
assertParsingError(formatter2.parse("BC -1234-02-22 02:22:22")) assertParsingError(formatter2.parse("AC 0000-02-22 02:22:22")) - assert(formatter2.parse("BC 1234-02-22 02:22:22") === instantToMicros( - LocalDateTime.of(-1233, 2, 22, 2, 22, 22).toInstant(ZoneOffset.UTC))) - assert(formatter2.parse("AD 1234-02-22 02:22:22") === instantToMicros( - LocalDateTime.of(1234, 2, 22, 2, 22, 22).toInstant(ZoneOffset.UTC))) + assert(formatter2.parse("BC 1234-02-22 02:22:22") === date(-1233, 2, 22, 2, 22, 22)) + assert(formatter2.parse("AD 1234-02-22 02:22:22") === date(1234, 2, 22, 2, 22, 22)) } test("SPARK-31557: rebasing in legacy formatters/parsers") { withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> LegacyBehaviorPolicy.LEGACY.toString) { - DateTimeTestUtils.outstandingZoneIds.foreach { zoneId => + outstandingZoneIds.foreach { zoneId => withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> zoneId.getId) { - DateTimeTestUtils.withDefaultTimeZone(zoneId) { + withDefaultTimeZone(zoneId) { withClue(s"zoneId = ${zoneId.getId}") { val formatters = LegacyDateFormats.values.map { legacyFormat => TimestampFormatter( @@ -296,7 +290,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers TimestampFormatter.defaultLocale, legacyFormat, needVarLengthSecondFraction = false) - }.toSeq :+ TimestampFormatter.getFractionFormatter(zoneId) + }.toSeq :+ TimestampFormatter.getFractionFormatter(zoneId) formatters.foreach { formatter => assert(microsToInstant(formatter.parse("1000-01-01 01:02:03")) .atZone(zoneId) @@ -317,4 +311,89 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers } } } + + test("parsing hour with various patterns") { + def createFormatter(pattern: String): TimestampFormatter = { + // Use `SIMPLE_DATE_FORMAT`, so that the legacy parser also fails with invalid value range. + TimestampFormatter(pattern, UTC, LegacyDateFormats.SIMPLE_DATE_FORMAT, false) + } + + withClue("HH") { + val formatter = createFormatter("yyyy-MM-dd HH") + + val micros1 = formatter.parse("2009-12-12 00") + assert(micros1 === date(2009, 12, 12)) + + val micros2 = formatter.parse("2009-12-12 15") + assert(micros2 === date(2009, 12, 12, 15)) + + intercept[DateTimeException](formatter.parse("2009-12-12 24")) + } + + withClue("kk") { + val formatter = createFormatter("yyyy-MM-dd kk") + + intercept[DateTimeException](formatter.parse("2009-12-12 00")) + + val micros1 = formatter.parse("2009-12-12 15") + assert(micros1 === date(2009, 12, 12, 15)) + + val micros2 = formatter.parse("2009-12-12 24") + assert(micros2 === date(2009, 12, 12)) + } + + withClue("KK") { + val formatter = createFormatter("yyyy-MM-dd KK a") + + val micros1 = formatter.parse("2009-12-12 00 am") + assert(micros1 === date(2009, 12, 12)) + + // For `KK`, "12:00:00 am" is the same as "00:00:00 pm". 
+ val micros2 = formatter.parse("2009-12-12 12 am") + assert(micros2 === date(2009, 12, 12, 12)) + + val micros3 = formatter.parse("2009-12-12 00 pm") + assert(micros3 === date(2009, 12, 12, 12)) + + intercept[DateTimeException](formatter.parse("2009-12-12 12 pm")) + } + + withClue("hh") { + val formatter = createFormatter("yyyy-MM-dd hh a") + + intercept[DateTimeException](formatter.parse("2009-12-12 00 am")) + + val micros1 = formatter.parse("2009-12-12 12 am") + assert(micros1 === date(2009, 12, 12)) + + intercept[DateTimeException](formatter.parse("2009-12-12 00 pm")) + + val micros2 = formatter.parse("2009-12-12 12 pm") + assert(micros2 === date(2009, 12, 12, 12)) + } + } + + test("missing date fields") { + val formatter = TimestampFormatter("HH:mm:ss", UTC) + val micros = formatter.parse("11:30:01") + assert(micros === date(1970, 1, 1, 11, 30, 1)) + } + + test("missing year field with invalid date") { + // Use `SIMPLE_DATE_FORMAT`, so that the legacy parser also fails with invalid date. + val formatter = TimestampFormatter("MM-dd", UTC, LegacyDateFormats.SIMPLE_DATE_FORMAT, false) + withDefaultTimeZone(UTC)(intercept[DateTimeException](formatter.parse("02-29"))) + } + + test("missing am/pm field") { + val formatter = TimestampFormatter("yyyy hh:mm:ss", UTC) + val micros = formatter.parse("2009 11:30:01") + assert(micros === date(2009, 1, 1, 11, 30, 1)) + } + + test("missing time fields") { + val formatter = TimestampFormatter("yyyy HH", UTC) + val micros = formatter.parse("2009 11") + assert(micros === date(2009, 1, 1, 11)) + } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index fd3325085df96..9be857ef767df 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -86,7 +86,7 @@ select date_sub('2011-11-11', str) from v; select null - date '2019-10-06'; select date '2001-10-01' - date '2001-09-28'; --- variable-length tests +-- variable-length second fraction tests select to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); select to_timestamp('2019-10-06 10:11:12.0', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); select to_timestamp('2019-10-06 10:11:12.1', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); @@ -95,7 +95,7 @@ select to_timestamp('2019-10-06 10:11:12.123UTC', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zz select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); select to_timestamp('2019-10-06 10:11:12.12345CST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); select to_timestamp('2019-10-06 10:11:12.123456PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); --- exceeded max variable length +-- second fraction exceeded max variable length select to_timestamp('2019-10-06 10:11:12.1234567PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); -- special cases select to_timestamp('123456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); @@ -122,3 +122,11 @@ select to_timestamp("2019-10-06T10:11:12'12", "yyyy-MM-dd'T'HH:mm:ss''SSSS"); -- select to_timestamp("2019-10-06T10:11:12'", "yyyy-MM-dd'T'HH:mm:ss''"); -- tail select to_timestamp("'2019-10-06T10:11:12", "''yyyy-MM-dd'T'HH:mm:ss"); -- head select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss"); -- head but as single quote + +-- missing fields +select to_timestamp("16", "dd"); +select to_timestamp("02-29", "MM-dd"); +select to_date("16", "dd"); +select to_date("02-29", "MM-dd"); +select to_timestamp("2019 40", "yyyy mm"); +select 
to_timestamp("2019 10:10:10", "yyyy hh:mm:ss"); diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index 6c14eee2e4e61..5bd78f5f6af3a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -48,6 +48,21 @@ select from_json('[null, {"a":2}]', 'array>'); select from_json('[{"a": 1}, {"b":2}]', 'array>'); select from_json('[{"a": 1}, 2]', 'array>'); +-- from_json - datetime type +select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp'); +select from_json( + '{"d": "12/15 2012", "t": "12/15 2012 15:15:15"}', + 'd date, t timestamp', + map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss')); +select from_json( + '{"d": "02-29"}', + 'd date', + map('dateFormat', 'MM-dd')); +select from_json( + '{"t": "02-29"}', + 't timestamp', + map('timestampFormat', 'MM-dd')); + -- to_json - array type select to_json(array('1', '2', '3')); select to_json(array(array(1, 2, 3), array(4))); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out index 81a73a6377715..8a75c2bb886ea 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 85 +-- Number of queries: 91 -- !query @@ -730,3 +730,51 @@ select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss") struct -- !query output 2019-10-06 10:11:12 + + +-- !query +select to_timestamp("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 00:00:00 + + +-- !query +select to_timestamp("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_date("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 + + +-- !query +select to_date("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2019 40", "yyyy mm") +-- !query schema +struct +-- !query output +2019-01-01 00:40:00 + + +-- !query +select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss") +-- !query schema +struct +-- !query output +2019-01-01 10:10:10 diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 2e600850c48b9..346263e37d6b0 100755 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 85 +-- Number of queries: 91 -- !query @@ -702,3 +702,51 @@ select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss") struct -- !query output 2019-10-06 10:11:12 + + +-- !query +select to_timestamp("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 00:00:00 + + +-- !query +select to_timestamp("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_date("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 + + +-- !query +select to_date("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2019 40", "yyyy mm") +-- !query schema +struct +-- !query output +2019-01-01 00:40:00 + + +-- !query +select 
to_timestamp("2019 10:10:10", "yyyy hh:mm:ss") +-- !query schema +struct +-- !query output +2019-01-01 10:10:10 diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 21a3531caf732..665c79c4753bc 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 42 +-- Number of queries: 46 -- !query @@ -288,6 +288,49 @@ struct>> NULL +-- !query +select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp') +-- !query schema +struct> +-- !query output +{"d":2012-12-15,"t":2012-12-15 15:15:15} + + +-- !query +select from_json( + '{"d": "12/15 2012", "t": "12/15 2012 15:15:15"}', + 'd date, t timestamp', + map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss')) +-- !query schema +struct> +-- !query output +{"d":2012-12-15,"t":2012-12-15 15:15:15} + + +-- !query +select from_json( + '{"d": "02-29"}', + 'd date', + map('dateFormat', 'MM-dd')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '02-29' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + +-- !query +select from_json( + '{"t": "02-29"}', + 't timestamp', + map('timestampFormat', 'MM-dd')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '02-29' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + -- !query select to_json(array('1', '2', '3')) -- !query schema