From f2bc0b26184c02b6019893c601eb479db9419689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Sat, 10 Nov 2018 14:28:01 +0100 Subject: [PATCH 1/3] Fix day of year calculation for Julian calendar days --- .../sql/catalyst/util/DateTimeUtils.scala | 53 +++++++++++++++---- .../catalyst/util/DateTimeUtilsSuite.scala | 24 +++++++++ 2 files changed, 66 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 5ae75dc939303..6060fc54e2ebd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -53,14 +53,27 @@ object DateTimeUtils { final val MILLIS_PER_DAY = SECONDS_PER_DAY * 1000L - // number of days in 400 years + // number of days in 400 years by Gregorian calendar final val daysIn400Years: Int = 146097 + + // In Julian calendar every year which is a multiple of 4 is a leap year. + // So in 400 years there are 3 extra leap years compared to the Gregorian calendar: + // 100, 200 and 300 (as multiples of 100 but not 400). + final val extraLeapDaysIn400YearsJulian = 3 + + // number of days in 400 years by Julian calendar + final val daysIn400YearsInJulian: Int = daysIn400Years + extraLeapDaysIn400YearsJulian + // number of days between 1.1.1970 and 1.1.2001 final val to2001 = -11323 // this is year -17999, calculation: 50 * daysIn400Year final val YearZero = -17999 final val toYearZero = to2001 + 7304850 + + // days to year -17999 in Julian calendar + final val toYearZeroInJulian = toYearZero + 49 * extraLeapDaysIn400YearsJulian + final val TimeZoneGMT = TimeZone.getTimeZone("GMT") final val TimeZoneUTC = TimeZone.getTimeZone("UTC") final val MonthOf31Days = Set(1, 3, 5, 7, 8, 10, 12) @@ -585,20 +598,30 @@ object DateTimeUtils { * Return the number of days since the start of 400 year period. * The second year of a 400 year period (year 1) starts on day 365. */ - private[this] def yearBoundary(year: Int): Int = { - year * 365 + ((year / 4 ) - (year / 100) + (year / 400)) + private[this] def yearBoundary(year: Int, isGregorian: Boolean): Int = { + if (isGregorian) { + year * 365 + ((year / 4) - (year / 100) + (year / 400)) + } else { + year * 365 + (year / 4) + } } /** * Calculates the number of years for the given number of days. This depends * on a 400 year period. * @param days days since the beginning of the 400 year period + * @param isGregorian indicates whether leap years should be calculated according to Gregorian + * (or Julian) calendar * @return (number of year, days in year) */ - private[this] def numYears(days: Int): (Int, Int) = { + private[this] def numYears(days: Int, isGregorian: Boolean): (Int, Int) = { val year = days / 365 - val boundary = yearBoundary(year) - if (days > boundary) (year, days - boundary) else (year - 1, days - yearBoundary(year - 1)) + val boundary = yearBoundary(year, isGregorian) + if (days > boundary) { + (year, days - boundary) + } else { + (year - 1, days - yearBoundary(year - 1, isGregorian)) + } } /** @@ -609,18 +632,26 @@ object DateTimeUtils { * equals to the period 1.1.1601 until 31.12.2000. */ private[this] def getYearAndDayInYear(daysSince1970: SQLDate): (Int, Int) = { - // add the difference (in days) between 1.1.1970 and the artificial year 0 (-17999) - var daysSince1970Tmp = daysSince1970 // Since Julian calendar was replaced with the Gregorian calendar, // the 10 days after Oct. 4 were skipped. // (1582-10-04) -141428 days since 1970-01-01 if (daysSince1970 <= -141428) { - daysSince1970Tmp -= 10 + getYearAndDayInYear(daysSince1970 - 10, toYearZeroInJulian, daysIn400YearsInJulian, false) + } else { + getYearAndDayInYear(daysSince1970, toYearZero, daysIn400Years, true) } - val daysNormalized = daysSince1970Tmp + toYearZero + } + + private def getYearAndDayInYear( + daysSince1970: SQLDate, + toYearZero: SQLDate, + daysIn400Years: SQLDate, + isGregorian: Boolean): (Int, Int) = { + // add the difference (in days) between 1.1.1970 and the artificial year 0 (-17999) + val daysNormalized = daysSince1970 + toYearZero val numOfQuarterCenturies = daysNormalized / daysIn400Years val daysInThis400 = daysNormalized % daysIn400Years + 1 - val (years, dayInYear) = numYears(daysInThis400) + val (years, dayInYear) = numYears(daysInThis400, isGregorian) val year: Int = (2001 - 20000) + 400 * numOfQuarterCenturies + years (year, dayInYear) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 0182eeb171215..e7d30bb545850 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -410,6 +410,30 @@ class DateTimeUtilsSuite extends SparkFunSuite { assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 78) } + test("SPARK-26002: correct day of year calculations for Julian calendar years") { + TimeZone.setDefault(TimeZoneUTC) + val c = Calendar.getInstance(TimeZoneUTC) + c.set(Calendar.MILLISECOND, 0) + (1000 to 1600 by 100).foreach { year => + // January 1 is the 1st day of year. + c.set(year, 0, 1, 0, 0, 0) + assert(getYear(getInUTCDays(c.getTimeInMillis)) === year) + assert(getMonth(getInUTCDays(c.getTimeInMillis)) === 1) + assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 1) + + // March 1 is the 61st day of the year as they are leap years. It is true for + // even the multiples of 100 as before 1582-10-4 the Julian calendar leap year calculation + // is used in which every multiples of 4 are leap years + c.set(year, 2, 1, 0, 0, 0) + assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 61) + assert(getMonth(getInUTCDays(c.getTimeInMillis)) === 3) + + // For non-leap years: + c.set(year + 1, 2, 1, 0, 0, 0) + assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 60) + } + } + test("get year") { val c = Calendar.getInstance() c.set(2015, 2, 18, 0, 0, 0) From 421673a4f6ae8820b57bb1fb92d9fe03dc646cde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Tue, 20 Nov 2018 13:40:47 +0100 Subject: [PATCH 2/3] review comments 1.0 --- .../apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index e7d30bb545850..1ede2cab25ac9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -411,8 +411,7 @@ class DateTimeUtilsSuite extends SparkFunSuite { } test("SPARK-26002: correct day of year calculations for Julian calendar years") { - TimeZone.setDefault(TimeZoneUTC) - val c = Calendar.getInstance(TimeZoneUTC) + val c = Calendar.getInstance() c.set(Calendar.MILLISECOND, 0) (1000 to 1600 by 100).foreach { year => // January 1 is the 1st day of year. @@ -432,6 +431,9 @@ class DateTimeUtilsSuite extends SparkFunSuite { c.set(year + 1, 2, 1, 0, 0, 0) assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 60) } + + c.set(1582, 2, 1, 0, 0, 0) + assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 60) } test("get year") { From 300ec3980a2824a71d46362a4ef7c7e1a14f6654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Thu, 3 Jan 2019 14:44:32 +0100 Subject: [PATCH 3/3] Add end2end and leap day tests, fix comment --- .../apache/spark/sql/catalyst/util/DateTimeUtils.scala | 9 ++++++--- .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 4 ++++ .../src/test/resources/sql-tests/inputs/datetime.sql | 2 ++ .../test/resources/sql-tests/results/datetime.sql.out | 10 +++++++++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 6060fc54e2ebd..f590c63f80b21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -56,9 +56,12 @@ object DateTimeUtils { // number of days in 400 years by Gregorian calendar final val daysIn400Years: Int = 146097 - // In Julian calendar every year which is a multiple of 4 is a leap year. - // So in 400 years there are 3 extra leap years compared to the Gregorian calendar: - // 100, 200 and 300 (as multiples of 100 but not 400). + // In the Julian calendar every year that is exactly divisible by 4 is a leap year without any + // exception. But in the Gregorian calendar every year that is exactly divisible by four + // is a leap year, except for years that are exactly divisible by 100, but these centurial years + // are leap years if they are exactly divisible by 400. + // So there are 3 extra days in the Julian calendar within a 400 years cycle compared to the + // Gregorian calendar. final val extraLeapDaysIn400YearsJulian = 3 // number of days in 400 years by Julian calendar diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 1ede2cab25ac9..2cb6110e2c093 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -427,6 +427,10 @@ class DateTimeUtilsSuite extends SparkFunSuite { assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 61) assert(getMonth(getInUTCDays(c.getTimeInMillis)) === 3) + // testing leap day (February 29) in leap years + c.set(year, 1, 29, 0, 0, 0) + assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 60) + // For non-leap years: c.set(year + 1, 2, 1, 0, 0, 0) assert(getDayInYear(getInUTCDays(c.getTimeInMillis)) === 60) diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index 547c2bef02b24..8bd8bc2b94b8e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -27,3 +27,5 @@ select current_date = current_date(), current_timestamp = current_timestamp(), a select a, b from ttf2 order by a, current_date; select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), weekday(null), weekday('1582-10-15 13:10:15'); + +select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01'); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 63aa00426ea32..2090633802e26 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 10 +-- Number of queries: 11 -- !query 0 @@ -89,3 +89,11 @@ select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), week struct -- !query 9 output 5 3 5 NULL 4 + + +-- !query 10 +select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01') +-- !query 10 schema +struct +-- !query 10 output +1500 1 1