Commit

Merge remote-tracking branch 'remotes/origin/master' into legacy-date-formatter-time-zone

# Conflicts:
#	sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
MaxGekk committed Jun 5, 2020
2 parents ee89265 + fc6af9d commit 6535b24
Showing 26 changed files with 1,515 additions and 201 deletions.
2 changes: 2 additions & 0 deletions docs/pyspark-migration-guide.md
@@ -45,6 +45,8 @@ Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide.html)

- As of Spark 3.0, `Row` field names are no longer sorted alphabetically when constructing with named arguments for Python versions 3.6 and above; the order of fields matches the order in which they were entered. To enable sorted fields by default, as in Spark 2.4, set the environment variable `PYSPARK_ROW_FIELD_SORTING_ENABLED` to `true` for both executors and driver. This environment variable must be consistent across the driver and all executors; otherwise, it may cause failures or incorrect answers. For Python versions earlier than 3.6, the field names are sorted alphabetically as the only option.

- In Spark 3.0, `pyspark.ml.param.shared.Has*` mixins do not provide any `set*(self, value)` setter methods anymore; use the respective `self.set(self.*, value)` instead. See [SPARK-29093](https://issues.apache.org/jira/browse/SPARK-29093) for details.

## Upgrading from PySpark 2.3 to 2.4

- In PySpark, when Arrow optimization is enabled, `toPandas` previously just failed when Arrow optimization could not be used, whereas `createDataFrame` from a Pandas DataFrame allowed falling back to non-optimized execution. Now both `toPandas` and `createDataFrame` from a Pandas DataFrame allow the fallback by default, and it can be switched off with `spark.sql.execution.arrow.fallback.enabled`.
14 changes: 2 additions & 12 deletions docs/sql-ref-datetime-pattern.md
@@ -36,11 +36,7 @@ Spark uses pattern letters in the following table for date and timestamp parsing
|**M/L**|month-of-year|month|7; 07; Jul; July|
|**d**|day-of-month|number(3)|28|
|**Q/q**|quarter-of-year|number/text|3; 03; Q3; 3rd quarter|
|**Y**|week-based-year|year|1996; 96|
|**w**|week-of-week-based-year|number(2)|27|
|**W**|week-of-month|number(1)|4|
|**E**|day-of-week|text|Tue; Tuesday|
|**u**|localized day-of-week|number/text|2; 02; Tue; Tuesday|
|**F**|week-of-month|number(1)|3|
|**a**|am-pm-of-day|am-pm|PM|
|**h**|clock-hour-of-am-pm (1-12)|number(2)|12|
@@ -63,7 +59,7 @@ Spark uses pattern letters in the following table for date and timestamp parsing

The count of pattern letters determines the format.

- Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short form. Exactly 4 pattern letters will use the full form. Exactly 5 pattern letters will use the narrow form. 5 or more letters will fail.
- Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short text form, typically an abbreviation, e.g. day-of-week Monday might output "Mon". Exactly 4 pattern letters will use the full text form, typically the full description, e.g. day-of-week Monday might output "Monday". 5 or more letters will fail.

- Number(n): Here n represents the maximum number of letters with which this type of datetime pattern can be used. If the count of letters is one, the value is output using the minimum number of digits and without padding. Otherwise, the count of digits is used as the width of the output field, with the value zero-padded as necessary.

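As a quick illustration of the two rules above, here is a sketch against `java.time.format.DateTimeFormatter`, which the new Spark formatters build on (note that Spark itself additionally rejects the 5-letter narrow forms, per SPARK-31771):

```scala
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.util.Locale

val d = LocalDate.of(2020, 6, 5)                          // a Friday
DateTimeFormatter.ofPattern("E", Locale.US).format(d)     // "Fri": short text form (< 4 letters)
DateTimeFormatter.ofPattern("EEEE", Locale.US).format(d)  // "Friday": full text form (exactly 4)
DateTimeFormatter.ofPattern("d", Locale.US).format(d)     // "5": one letter, minimum digits, no padding
DateTimeFormatter.ofPattern("dd", Locale.US).format(d)    // "05": zero-padded to the letter count
```
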
@@ -137,10 +133,4 @@ The count of pattern letters determines the format.
During parsing, the whole section may be missing from the parsed string.
An optional section is started by `[` and ended using `]` (or at the end of the pattern).

- Symbols of 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.

More details for the text style:

- Short Form: Short text, typically an abbreviation. For example, day-of-week Monday might output "Mon".

- Full Form: Full text, typically the full description. For example, day-of-week Monday might output "Monday".
- Symbols of 'E', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.
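
For example, under the default parser policy in Spark 3.0 (a sketch; assumes a `spark-shell` session with `spark` in scope, output abbreviated):

```scala
spark.sql("SELECT date_format(date'2020-06-05', 'yyyy QQQ')").show()
// 2020 Q2 -- 'Q' is accepted for formatting
spark.sql("SELECT to_timestamp('2020 2', 'yyyy Q')").show()
// fails with "Illegal pattern character: Q" -- 'Q' is rejected for parsing
```
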
2 changes: 1 addition & 1 deletion python/pyspark/sql/utils.py
@@ -44,7 +44,7 @@ def __str__(self):
debug_enabled = sql_conf.pysparkJVMStacktraceEnabled()
desc = self.desc
if debug_enabled:
desc = desc + "\nJVM stacktrace:\n%s" % self.stackTrace
desc = desc + "\n\nJVM stacktrace:\n%s" % self.stackTrace
# encode unicode instance for python2 for human readable description
if sys.version_info.major < 3 and isinstance(desc, unicode):
return str(desc.encode('utf-8'))
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -46,7 +46,7 @@ class Iso8601DateFormatter(
extends DateFormatter with DateTimeFormatterHelper {

@transient
private lazy val formatter = getOrCreateFormatter(pattern, locale)
private lazy val formatter = getOrCreateFormatter(pattern, locale, isParsing)

@transient
private lazy val legacyFormatter = DateFormatter.getLegacyFormatter(
@@ -58,12 +58,15 @@ class Iso8601DateFormatter(
try {
val localDate = toLocalDate(formatter.parse(s))
localDateToDays(localDate)
} catch checkDiffResult(s, legacyFormatter.parse)
} catch checkParsedDiff(s, legacyFormatter.parse)
}
}

override def format(localDate: LocalDate): String = {
localDate.format(formatter)
try {
localDate.format(formatter)
} catch checkFormattedDiff(toJavaDate(localDateToDays(localDate)),
(d: Date) => format(d))
}

override def format(days: Int): String = {
@@ -137,7 +140,7 @@ object DateFormatter {
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean = true): DateFormatter = {
isParsing: Boolean): DateFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
@@ -170,11 +173,11 @@ object DateFormatter {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}

def apply(format: String, zoneId: ZoneId): DateFormatter = {
getFormatter(Some(format), zoneId)
def apply(format: String, zoneId: ZoneId, isParsing: Boolean = false): DateFormatter = {
getFormatter(Some(format), zoneId, isParsing = isParsing)
}

def apply(zoneId: ZoneId): DateFormatter = {
getFormatter(None, zoneId)
getFormatter(None, zoneId, isParsing = false)
}
}
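
A sketch of how the reworked factory above is now meant to be called (this is internal Catalyst API; the values are illustrative):

```scala
import java.time.ZoneOffset
import org.apache.spark.sql.catalyst.util.DateFormatter

// Formatting call sites can rely on the default isParsing = false ...
val printer = DateFormatter("yyyy-MM-dd", ZoneOffset.UTC)
printer.format(18418)       // "2020-06-05" (the argument is days since the epoch)

// ... while parsing call sites now have to opt in explicitly.
val parser = DateFormatter("yyyy-MM-dd", ZoneOffset.UTC, isParsing = true)
parser.parse("2020-06-05")  // 18418
```
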
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
@@ -21,7 +21,7 @@ import java.time._
import java.time.chrono.IsoChronology
import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle}
import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries}
import java.util.Locale
import java.util.{Date, Locale}

import com.google.common.cache.CacheBuilder

@@ -97,7 +97,7 @@ trait DateTimeFormatterHelper {
protected def getOrCreateFormatter(
pattern: String,
locale: Locale,
isParsing: Boolean = false): DateTimeFormatter = {
isParsing: Boolean): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern, isParsing)
val useVarLen = isParsing && newPattern.contains('S')
val key = (newPattern, locale, useVarLen)
@@ -109,13 +109,17 @@
formatter
}

private def needConvertToSparkUpgradeException(e: Throwable): Boolean = e match {
case _: DateTimeException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION => true
case _ => false
}
// When the legacy time parser policy is set to EXCEPTION, check whether we get different results
// between the legacy parser and the new parser. If the new parser fails but the legacy parser
// works, throw a SparkUpgradeException. In contrast, when the policy is set to CORRECTED,
// the DateTimeParseException is left to be handled by the caller side.
protected def checkDiffResult[T](
protected def checkParsedDiff[T](
s: String, legacyParseFunc: String => T): PartialFunction[Throwable, T] = {
case e: DateTimeException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION =>
case e if needConvertToSparkUpgradeException(e) =>
try {
legacyParseFunc(s)
} catch {
@@ -126,6 +130,25 @@
s"before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.", e)
}

// When the legacy time parser policy is set to EXCEPTION, check whether we get different results
// between the legacy formatter and the new formatter. If the new formatter fails but the legacy
// formatter works, throw a SparkUpgradeException. In contrast, when the policy is set to
// CORRECTED, the DateTimeParseException is left to be handled by the caller side.
protected def checkFormattedDiff[T <: Date](
d: T,
legacyFormatFunc: T => String): PartialFunction[Throwable, String] = {
case e if needConvertToSparkUpgradeException(e) =>
val resultCandidate = try {
legacyFormatFunc(d)
} catch {
case _: Throwable => throw e
}
throw new SparkUpgradeException("3.0", s"Fail to format it to '$resultCandidate' in the new" +
s" formatter. You can set ${SQLConf.LEGACY_TIME_PARSER_POLICY.key} to LEGACY to restore" +
" the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid" +
" datetime string.", e)
}
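
// A standalone sketch of the idiom the two helpers above rely on (illustration with
// hypothetical names, not part of this file): Scala's `catch` accepts any
// PartialFunction[Throwable, T], so the "retry with the legacy formatter, then surface
// an upgrade hint" recovery is defined once and reused at every parse/format call site:
//
//   def checkDiff[T](s: String, legacyParse: String => T): PartialFunction[Throwable, T] = {
//     case e: DateTimeException =>
//       // If the legacy path fails too, keep the original error ...
//       try legacyParse(s) catch { case _: Throwable => throw e }
//       // ... otherwise it succeeded where the new parser failed: raise the hint.
//       throw new SparkUpgradeException("3.0", s"'$s' only parses with the legacy formatter", e)
//   }
//
//   def parse(s: String): LocalDate =
//     try LocalDate.parse(s, newFormatter) catch checkDiff(s, legacyParse)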

/**
* When the new DateTimeFormatter fails to initialize because of an invalid datetime pattern, it
* will throw an IllegalArgumentException. If the pattern can be recognized by the legacy formatter
@@ -137,7 +160,6 @@
* @param tryLegacyFormatter a function to capture the exception, which identically forces a
* legacy datetime formatter to be initialized
*/

protected def checkLegacyFormatter(
pattern: String,
tryLegacyFormatter: => Unit): PartialFunction[Throwable, DateTimeFormatter] = {
@@ -234,22 +256,27 @@ private object DateTimeFormatterHelper {
val formatter = DateTimeFormatter.ofPattern("LLL qqq", Locale.US)
formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
}
final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
// SPARK-31892: The week-based date fields are rarely used and really confusing for parsing values
// to datetime, especially when they are mixed with other non-week-based ones
// to datetime, especially when they are mixed with other non-week-based ones;
// SPARK-31879: It's also difficult for us to restore the formatting behavior of week-based
// date fields: in DateTimeFormatter the first day of week for week-based date fields becomes
// localized; for the default Locale.US it uses Sunday as the first day of week, while in Spark
// 2.4 the SimpleDateFormat uses Monday as the first day of week.
final val weekBasedLetters = Set('Y', 'W', 'w', 'u', 'e', 'c')
final val unsupportedLetters = Set('A', 'n', 'N', 'p')
// The quarter fields will also be parsed strangely, e.g. when the pattern contains `yMd` and
// can be directly resolved, then `q` does check whether the month is valid, but if the date
// fields are incomplete, e.g. `yM`, the check is bypassed.
final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')
final val unsupportedLettersForParsing = Set('E', 'F', 'q', 'Q')
final val unsupportedPatternLengths = {
// SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
// 2.4
Seq("G", "M", "L", "E", "u", "Q", "q").map(_ * 5) ++
Seq("G", "M", "L", "E", "Q", "q").map(_ * 5) ++
// SPARK-31867: Disable year patterns longer than 10 letters, which make the Java time library
// throw an unchecked `ArrayIndexOutOfBoundsException` from the `NumberPrinterParser` when
// formatting. That makes the call side difficult to handle exceptions and easily leads to
// silent data change because of the exceptions being suppressed.
Seq("y", "Y").map(_ * 11)
Seq("y").map(_ * 11)
}.toSet
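// i.e. the set above expands to Set("GGGGG", "MMMMM", "LLLLL", "EEEEE", "QQQQQ", "qqqqq",
// "yyyyyyyyyyy"): `_ * 5` is Scala's StringOps repetition, e.g. "G" * 5 == "GGGGG".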

/**
@@ -260,7 +287,7 @@
* @param pattern The input pattern.
* @return The pattern for the new parser
*/
def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
def convertIncompatiblePattern(pattern: String, isParsing: Boolean): String = {
val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
case (patternPart, index) =>
// Text can be quoted using single quotes; we only check the non-quoted parts.
@@ -269,6 +296,10 @@
(pattern + " ").split("'").zipWithIndex.map {
case (patternPart, index) =>
if (index % 2 == 0) {
for (c <- patternPart if weekBasedLetters.contains(c)) {
throw new IllegalArgumentException(s"All week-based patterns are unsupported since" +
s" Spark 3.0, detected: $c, Please use the SQL function EXTRACT instead")
}
for (c <- patternPart if unsupportedLetters.contains(c) ||
(isParsing && unsupportedLettersForParsing.contains(c))) {
throw new IllegalArgumentException(s"Illegal pattern character: $c")
@@ -282,20 +313,13 @@
"or upgrade your Java version. For more details, please read " +
"https://bugs.openjdk.java.net/browse/JDK-8114833")
}
// The meaning of 'u' was day number of week in SimpleDateFormat; it was changed to year
// in DateTimeFormatter. Substitute 'u' with 'e' and use DateTimeFormatter to parse the
// string. If parsable, return the result; otherwise, fall back to 'u', and then use the
// legacy SimpleDateFormat parser to parse. When it is successfully parsed, throw an
// exception and ask users to change the pattern strings or turn on the legacy mode;
// otherwise, return NULL as Spark 2.4 does.
val res = patternPart.replace("u", "e")
// In DateTimeFormatter, 'u' supports negative years. We substitute 'y' with 'u' here to
// keep that support in Spark 3.0. If parsing fails in Spark 3.0, fall back to 'y'.
// We only do this substitution when there is no era designator found in the pattern.
if (!eraDesignatorContained) {
res.replace("y", "u")
patternPart.replace("y", "u")
} else {
res
patternPart
}
} else {
patternPart
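
Taken together, the conversion above behaves roughly as follows (a sketch; `convertIncompatiblePattern` is private to this object, so the calls are illustrative):

```scala
convertIncompatiblePattern("yyyy-MM-dd", isParsing = false) // "uuuu-MM-dd": 'y' -> 'u' keeps negative-year support
convertIncompatiblePattern("GGGG yyyy", isParsing = false)  // unchanged: an era designator is present, so 'y' stays
convertIncompatiblePattern("'y' yyyy", isParsing = false)   // "'y' uuuu": single-quoted text is never rewritten
convertIncompatiblePattern("YYYY-ww", isParsing = false)    // throws: week-based 'Y'/'w' are unsupported in 3.0
convertIncompatiblePattern("yyyy Q", isParsing = true)      // throws: 'Q' is formatting-only
```
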
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -62,11 +62,11 @@ class Iso8601TimestampFormatter(
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction: Boolean)
isParsing: Boolean)
extends TimestampFormatter with DateTimeFormatterHelper {
@transient
protected lazy val formatter: DateTimeFormatter =
getOrCreateFormatter(pattern, locale, needVarLengthSecondFraction)
getOrCreateFormatter(pattern, locale, isParsing)

@transient
protected lazy val legacyFormatter = TimestampFormatter.getLegacyFormatter(
@@ -84,12 +84,15 @@ class Iso8601TimestampFormatter(
val microsOfSecond = zonedDateTime.get(MICRO_OF_SECOND)

Math.addExact(SECONDS.toMicros(epochSeconds), microsOfSecond)
} catch checkDiffResult(s, legacyFormatter.parse)
} catch checkParsedDiff(s, legacyFormatter.parse)
}
}

override def format(instant: Instant): String = {
formatter.withZone(zoneId).format(instant)
try {
formatter.withZone(zoneId).format(instant)
} catch checkFormattedDiff(toJavaTimestamp(instantToMicros(instant)),
(t: Timestamp) => format(t))
}

override def format(us: Long): String = {
@@ -122,7 +125,7 @@ class FractionTimestampFormatter(zoneId: ZoneId)
zoneId,
TimestampFormatter.defaultLocale,
LegacyDateFormats.FAST_DATE_FORMAT,
needVarLengthSecondFraction = false) {
isParsing = false) {

@transient
override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
@@ -287,7 +290,7 @@ object TimestampFormatter {
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean = false): TimestampFormatter = {
isParsing: Boolean): TimestampFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
@@ -334,12 +337,12 @@ object TimestampFormatter {
def apply(
format: String,
zoneId: ZoneId,
isParsing: Boolean = false): TimestampFormatter = {
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, isParsing = isParsing)
}

def apply(zoneId: ZoneId): TimestampFormatter = {
getFormatter(None, zoneId)
getFormatter(None, zoneId, isParsing = false)
}

def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = {
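
The user-visible effect of the stricter patterns and the policy fallback above can be sketched from `spark-shell` (illustrative; assumes a Spark 3.0 build with `spark` in scope, error messages abbreviated):

```scala
// Default policy (EXCEPTION): week-based letters are rejected up front.
spark.sql("SELECT date_format(date'2020-06-05', 'YYYY-ww')").show()
// fails, pointing at spark.sql.legacy.timeParserPolicy and the EXTRACT function

// Opting back into the Spark 2.4 behavior routes through the legacy formatter path.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
spark.sql("SELECT date_format(date'2020-06-05', 'YYYY-ww')").show()
// e.g. 2020-23 (week-based year and week-of-year, as in 2.4)
```
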
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -41,7 +41,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
private val JST_OPT = Option(JST.getId)

def toMillis(timestamp: String): Long = {
val tf = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC)
val tf = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC, isParsing = true)
DateTimeUtils.microsToMillis(tf.parse(timestamp))
}
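// e.g. toMillis("2015-04-08 13:10:15") == 1428498615000L, since the formatter pins parsing to UTC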
val date = "2015-04-08 13:10:15"
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
@@ -15,20 +15,23 @@
* limitations under the License.
*/

package org.apache.spark.sql.util
package org.apache.spark.sql.catalyst.util

import java.time.{DateTimeException, LocalDate, ZoneId}
import java.util.{Calendar, TimeZone}

import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyDateFormats}
import org.apache.spark.SparkUpgradeException
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy

class DateFormatterSuite extends SparkFunSuite with SQLHelper {
class DateFormatterSuite extends DatetimeFormatterSuite {

override def checkFormatterCreation(pattern: String, isParsing: Boolean): Unit = {
DateFormatter(pattern, UTC, isParsing)
}

private def withOutstandingZoneIds(f: ZoneId => Unit): Unit = {
for {
jvmZoneId <- outstandingZoneIds
