From c940381a0be36fd227e8f63caf32d3be86c5aa69 Mon Sep 17 00:00:00 2001 From: Sharad Sonker Date: Thu, 7 Jun 2018 10:26:37 +0530 Subject: [PATCH] [SPARK-24457][SQL] Improving performance of stringToTimestamp by caching Calendar instances for input timezones instead of creating new everytime --- .../sql/catalyst/util/DateTimeUtils.scala | 24 +++++++-- .../sql/StringToTimestampBenchmark.scala | 53 +++++++++++++++++++ 2 files changed, 74 insertions(+), 3 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/StringToTimestampBenchmark.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 80f15053005f..688011d958fc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -28,6 +28,8 @@ import scala.annotation.tailrec import org.apache.spark.unsafe.types.UTF8String +import scala.collection.mutable + /** * Helper functions for converting between internal and external date and time representations. * Dates are exposed externally as java.sql.Date and are represented internally as the number of @@ -111,6 +113,22 @@ object DateTimeUtils { computedTimeZones.computeIfAbsent(timeZoneId, computeTimeZone) } + private val threadLocalComputedCalendarsMap = + new ThreadLocal[mutable.Map[TimeZone, Calendar]] { + override def initialValue(): mutable.Map[TimeZone, Calendar] = { + mutable.Map[TimeZone, Calendar]() + } + } + + def getCalendar(timeZone: TimeZone): Calendar = { + val c = threadLocalComputedCalendarsMap.get() + .getOrElseUpdate(timeZone, { + Calendar.getInstance(timeZone) + }) + c.setTimeInMillis(System.currentTimeMillis()) + c + } + def newDateFormat(formatString: String, timeZone: TimeZone): DateFormat = { val sdf = new SimpleDateFormat(formatString, Locale.US) sdf.setTimeZone(timeZone) @@ -438,10 +456,9 @@ object DateTimeUtils { if (tz.isDefined && rejectTzInString) return None val c = if (tz.isEmpty) { - Calendar.getInstance(timeZone) + getCalendar(timeZone) } else { - Calendar.getInstance( - getTimeZone(f"GMT${tz.get.toChar}${segments(7)}%02d:${segments(8)}%02d")) + getCalendar(getTimeZone(f"GMT${tz.get.toChar}${segments(7)}%02d:${segments(8)}%02d")) } c.set(Calendar.MILLISECOND, 0) @@ -1162,5 +1179,6 @@ object DateTimeUtils { threadLocalGmtCalendar.remove() threadLocalTimestampFormat.remove() threadLocalDateFormat.remove() + threadLocalComputedCalendarsMap.remove() } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/StringToTimestampBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/StringToTimestampBenchmark.scala new file mode 100644 index 000000000000..30f7cc417cce --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/StringToTimestampBenchmark.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.Calendar + +import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils} +import org.apache.spark.util.Benchmark + +object StringToTimestampBenchmark { + + def main(args: Array[String]): Unit = { + + val len = 100000 + val benchmark = new Benchmark("string to timestamp", len) + + val tzs = Seq.fill(len)(scala.util.Random.nextInt(DateTimeTestUtils.ALL_TIMEZONES.length)) + .map(DateTimeTestUtils.ALL_TIMEZONES(_)) + + benchmark.addCase("Creating calendar instance on each call") { _ => + tzs.foreach(Calendar.getInstance) + } + + benchmark.addCase("Caching calendar instance") { _ => + tzs.foreach(DateTimeUtils.getCalendar) + } + + /* + Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz + + string to timestamp: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Creating calendar instance on each call 20 / 21 5.1 195.0 1.0X + Caching calendar instance 8 / 8 12.7 78.6 2.5X + */ + benchmark.run() + } +}