From d297817b7457fef40eb78b803542aed213afb7fc Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 5 Nov 2018 13:31:22 +0800 Subject: [PATCH 1/3] trim() the string when cast stringToTimestamp and stringToDate --- .../apache/spark/sql/catalyst/expressions/Cast.scala | 8 ++++---- .../spark/sql/catalyst/expressions/CastSuite.scala | 12 +++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index ee463bf5eb6ac..23a05a0fa2d4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -359,7 +359,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String // TimestampConverter private[this] def castToTimestamp(from: DataType): Any => Any = from match { case StringType => - buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs, timeZone).orNull) + buildCast[UTF8String](_, s => DateTimeUtils.stringToTimestamp(s.trim(), timeZone).orNull) case BooleanType => buildCast[Boolean](_, b => if (b) 1L else 0) case LongType => @@ -402,7 +402,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String // DateConverter private[this] def castToDate(from: DataType): Any => Any = from match { case StringType => - buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s).orNull) + buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s.trim()).orNull) case TimestampType => // throw valid precision more than seconds, according to Hive. // Timestamp.nanos is in 0 to 999,999,999, no more than a second. 
@@ -907,7 +907,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String val intOpt = ctx.freshVariable("intOpt", classOf[Option[Integer]]) (c, evPrim, evNull) => code""" scala.Option $intOpt = - org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c); + org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c.trim()); if ($intOpt.isDefined()) { $evPrim = ((Integer) $intOpt.get()).intValue(); } else { @@ -1010,7 +1010,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String (c, evPrim, evNull) => code""" scala.Option $longOpt = - org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c, $tz); + org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c.trim(), $tz); if ($longOpt.isDefined()) { $evPrim = ((Long) $longOpt.get()).longValue(); } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 94dee7ea048c3..9d59e42b96046 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -98,8 +98,10 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { c = Calendar.getInstance() c.set(2015, 2, 18, 0, 0, 0) c.set(Calendar.MILLISECOND, 0) - checkEvaluation(Cast(Literal("2015-03-18"), DateType), new Date(c.getTimeInMillis)) - checkEvaluation(Cast(Literal("2015-03-18 "), DateType), new Date(c.getTimeInMillis)) + + Seq("2015-03-18", " 2015-03-18", "2015-03-18 ", " 2015-03-18 ").foreach { s => + checkEvaluation(Cast(Literal(s), DateType), new Date(c.getTimeInMillis)) + } checkEvaluation(Cast(Literal("2015-03-18 123142"), DateType), new Date(c.getTimeInMillis)) checkEvaluation(Cast(Literal("2015-03-18T123123"), DateType), new Date(c.getTimeInMillis)) 
checkEvaluation(Cast(Literal("2015-03-18T"), DateType), new Date(c.getTimeInMillis)) @@ -130,9 +132,9 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { c = Calendar.getInstance(tz) c.set(2015, 2, 18, 0, 0, 0) c.set(Calendar.MILLISECOND, 0) - checkCastStringToTimestamp("2015-03-18", new Timestamp(c.getTimeInMillis)) - checkCastStringToTimestamp("2015-03-18 ", new Timestamp(c.getTimeInMillis)) - checkCastStringToTimestamp("2015-03-18T", new Timestamp(c.getTimeInMillis)) + Seq("2015-03-18", " 2015-03-18", "2015-03-18 ", " 2015-03-18 ", "2015-03-18T").foreach { s => + checkCastStringToTimestamp(s, new Timestamp(c.getTimeInMillis)) + } c = Calendar.getInstance(tz) c.set(2015, 2, 18, 12, 3, 17) From 5090d52868dcdd38ae76587d85225b9f1675c6f0 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 6 Nov 2018 18:01:32 +0800 Subject: [PATCH 2/3] trim stringToTimestamp and stringToDate --- .../spark/sql/catalyst/expressions/Cast.scala | 8 +++---- .../sql/catalyst/util/DateTimeUtils.scala | 8 +++---- .../sql/catalyst/expressions/CastSuite.scala | 12 +++++------ .../catalyst/util/DateTimeUtilsSuite.scala | 21 +++++++------------ 4 files changed, 21 insertions(+), 28 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 23a05a0fa2d4d..ee463bf5eb6ac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -359,7 +359,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String // TimestampConverter private[this] def castToTimestamp(from: DataType): Any => Any = from match { case StringType => - buildCast[UTF8String](_, s => DateTimeUtils.stringToTimestamp(s.trim(), timeZone).orNull) + buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs, timeZone).orNull) 
case BooleanType => buildCast[Boolean](_, b => if (b) 1L else 0) case LongType => @@ -402,7 +402,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String // DateConverter private[this] def castToDate(from: DataType): Any => Any = from match { case StringType => - buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s.trim()).orNull) + buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s).orNull) case TimestampType => // throw valid precision more than seconds, according to Hive. // Timestamp.nanos is in 0 to 999,999,999, no more than a second. @@ -907,7 +907,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String val intOpt = ctx.freshVariable("intOpt", classOf[Option[Integer]]) (c, evPrim, evNull) => code""" scala.Option $intOpt = - org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c.trim()); + org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToDate($c); if ($intOpt.isDefined()) { $evPrim = ((Integer) $intOpt.get()).intValue(); } else { @@ -1010,7 +1010,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String (c, evPrim, evNull) => code""" scala.Option $longOpt = - org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c.trim(), $tz); + org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c, $tz); if ($longOpt.isDefined()) { $evPrim = ((Long) $longOpt.get()).longValue(); } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 81d7274607ac8..6caf5b065ac76 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -274,7 +274,7 @@ object DateTimeUtils { } /** - * Parses a given UTF8 date string to the corresponding a corresponding [[Long]] value. 
+ * Parses a trimmed UTF8 date string to the corresponding a corresponding [[Long]] value. * The return type is [[Option]] in order to distinguish between 0L and null. The following * formats are allowed: * @@ -311,7 +311,7 @@ object DateTimeUtils { val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0) var i = 0 var currentSegmentValue = 0 - val bytes = s.getBytes + val bytes = s.trim.getBytes var j = 0 var digitsMilli = 0 var justTime = false @@ -441,7 +441,7 @@ object DateTimeUtils { } /** - * Parses a given UTF8 date string to a corresponding [[Int]] value. + * Parses a trimmed UTF8 date string to a corresponding [[Int]] value. * The return type is [[Option]] in order to distinguish between 0 and null. The following * formats are allowed: * @@ -459,7 +459,7 @@ object DateTimeUtils { val segments: Array[Int] = Array[Int](1, 1, 1) var i = 0 var currentSegmentValue = 0 - val bytes = s.getBytes + val bytes = s.trim.getBytes var j = 0 while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) { val b = bytes(j) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 9d59e42b96046..94dee7ea048c3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -98,10 +98,8 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { c = Calendar.getInstance() c.set(2015, 2, 18, 0, 0, 0) c.set(Calendar.MILLISECOND, 0) - - Seq("2015-03-18", " 2015-03-18", "2015-03-18 ", " 2015-03-18 ").foreach { s => - checkEvaluation(Cast(Literal(s), DateType), new Date(c.getTimeInMillis)) - } + checkEvaluation(Cast(Literal("2015-03-18"), DateType), new Date(c.getTimeInMillis)) + checkEvaluation(Cast(Literal("2015-03-18 "), DateType), new Date(c.getTimeInMillis)) 
checkEvaluation(Cast(Literal("2015-03-18 123142"), DateType), new Date(c.getTimeInMillis)) checkEvaluation(Cast(Literal("2015-03-18T123123"), DateType), new Date(c.getTimeInMillis)) checkEvaluation(Cast(Literal("2015-03-18T"), DateType), new Date(c.getTimeInMillis)) @@ -132,9 +130,9 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { c = Calendar.getInstance(tz) c.set(2015, 2, 18, 0, 0, 0) c.set(Calendar.MILLISECOND, 0) - Seq("2015-03-18", " 2015-03-18", "2015-03-18 ", " 2015-03-18 ", "2015-03-18T").foreach { s => - checkCastStringToTimestamp(s, new Timestamp(c.getTimeInMillis)) - } + checkCastStringToTimestamp("2015-03-18", new Timestamp(c.getTimeInMillis)) + checkCastStringToTimestamp("2015-03-18 ", new Timestamp(c.getTimeInMillis)) + checkCastStringToTimestamp("2015-03-18T", new Timestamp(c.getTimeInMillis)) c = Calendar.getInstance(tz) c.set(2015, 2, 18, 12, 3, 17) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 2423668392231..0182eeb171215 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -140,16 +140,10 @@ class DateTimeUtilsSuite extends SparkFunSuite { c = Calendar.getInstance() c.set(2015, 2, 18, 0, 0, 0) c.set(Calendar.MILLISECOND, 0) - assert(stringToDate(UTF8String.fromString("2015-03-18")).get === - millisToDays(c.getTimeInMillis)) - assert(stringToDate(UTF8String.fromString("2015-03-18 ")).get === - millisToDays(c.getTimeInMillis)) - assert(stringToDate(UTF8String.fromString("2015-03-18 123142")).get === - millisToDays(c.getTimeInMillis)) - assert(stringToDate(UTF8String.fromString("2015-03-18T123123")).get === - millisToDays(c.getTimeInMillis)) - assert(stringToDate(UTF8String.fromString("2015-03-18T")).get === - 
millisToDays(c.getTimeInMillis)) + Seq("2015-03-18", "2015-03-18 ", " 2015-03-18", " 2015-03-18 ", "2015-03-18 123142", + "2015-03-18T123123", "2015-03-18T").foreach { s => + assert(stringToDate(UTF8String.fromString(s)).get === millisToDays(c.getTimeInMillis)) + } assert(stringToDate(UTF8String.fromString("2015-03-18X")).isEmpty) assert(stringToDate(UTF8String.fromString("2015/03/18")).isEmpty) @@ -214,9 +208,10 @@ class DateTimeUtilsSuite extends SparkFunSuite { c = Calendar.getInstance(tz) c.set(2015, 2, 18, 0, 0, 0) c.set(Calendar.MILLISECOND, 0) - checkStringToTimestamp("2015-03-18", Option(c.getTimeInMillis * 1000)) - checkStringToTimestamp("2015-03-18 ", Option(c.getTimeInMillis * 1000)) - checkStringToTimestamp("2015-03-18T", Option(c.getTimeInMillis * 1000)) + + Seq("2015-03-18", "2015-03-18 ", " 2015-03-18", " 2015-03-18 ", "2015-03-18T").foreach { s => + checkStringToTimestamp(s, Option(c.getTimeInMillis * 1000)) + } c = Calendar.getInstance(tz) c.set(2015, 2, 18, 12, 3, 17) From b866d65c534d016f814946236b55ff05f79a4490 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Wed, 7 Nov 2018 07:58:10 +0800 Subject: [PATCH 3/3] Parses a trimmed UTF8 -> Trim and parse a given UTF8 --- .../org/apache/spark/sql/catalyst/util/DateTimeUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 6caf5b065ac76..5ae75dc939303 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -274,7 +274,7 @@ object DateTimeUtils { } /** - * Parses a trimmed UTF8 date string to the corresponding a corresponding [[Long]] value. + * Trim and parse a given UTF8 date string to a corresponding [[Long]] value. 
* The return type is [[Option]] in order to distinguish between 0L and null. The following * formats are allowed: * @@ -441,7 +441,7 @@ object DateTimeUtils { } /** - * Parses a trimmed UTF8 date string to a corresponding [[Int]] value. + * Trim and parse a given UTF8 date string to a corresponding [[Int]] value. * The return type is [[Option]] in order to distinguish between 0 and null. The following * formats are allowed: *