Skip to content

Commit

Permalink
[SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive
Browse files Browse the repository at this point in the history
We misinterpreted the Julian day and nanoseconds-of-day fields of Parquet timestamps (TimestampType) written by Hive/Impala: the two values overlap, so they can't be added together directly.

To avoid confusing rounding during the conversion, we use `2440588` as the Julian day of the Unix epoch (the exact value is 2440587.5).

Author: Davies Liu <davies@databricks.com>
Author: Cheng Lian <lian@databricks.com>

Closes #8400 from davies/timestamp_parquet.

(cherry picked from commit 2f493f7)
Signed-off-by: Cheng Lian <lian@databricks.com>
  • Loading branch information
Davies Liu authored and liancheng committed Aug 25, 2015
1 parent 2032d66 commit e5cea56
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ object DateTimeUtils {
type SQLTimestamp = Long

// see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
final val JULIAN_DAY_OF_EPOCH = 2440587 // and .5
// it's 2440587.5, rounded up to be compatible with Hive
final val JULIAN_DAY_OF_EPOCH = 2440588
final val SECONDS_PER_DAY = 60 * 60 * 24L
final val MICROS_PER_SECOND = 1000L * 1000L
final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
Expand Down Expand Up @@ -183,15 +184,15 @@ object DateTimeUtils {
*/
def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = {
// use Long to avoid rounding errors
val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - SECONDS_PER_DAY / 2
val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
seconds * MICROS_PER_SECOND + nanoseconds / 1000L
}

/**
* Returns Julian day and nanoseconds in a day from the number of microseconds
*/
def toJulianDay(us: SQLTimestamp): (Int, Long) = {
val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2
val seconds = us / MICROS_PER_SECOND
val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
val secondsInDay = seconds % SECONDS_PER_DAY
val nanos = (us % MICROS_PER_SECOND) * 1000L
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,18 @@ class DateTimeUtilsSuite extends SparkFunSuite {
test("us and julian day") {
val (d, ns) = toJulianDay(0)
assert(d === JULIAN_DAY_OF_EPOCH)
assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND)
assert(ns === 0)
assert(fromJulianDay(d, ns) == 0L)

val t = new Timestamp(61394778610000L) // (2015, 6, 11, 10, 10, 10, 100)
val t = Timestamp.valueOf("2015-06-11 10:10:10.100")
val (d1, ns1) = toJulianDay(fromJavaTimestamp(t))
val t2 = toJavaTimestamp(fromJulianDay(d1, ns1))
assert(t.equals(t2))
val t1 = toJavaTimestamp(fromJulianDay(d1, ns1))
assert(t.equals(t1))

val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100")
val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2))
val t22 = toJavaTimestamp(fromJulianDay(d2, ns2))
assert(t2.equals(t22))
}

test("SPARK-6785: java date conversion before and after epoch") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with Before
"BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE", "STRING")
}

ignore("SPARK-10177 timestamp") {
test("SPARK-10177 timestamp") {
testParquetHiveCompatibility(Row(Timestamp.valueOf("2015-08-24 00:31:00")), "TIMESTAMP")
}

Expand Down

0 comments on commit e5cea56

Please sign in to comment.