From 71a660ac8dad869d9ba3b4e206b74f5c44660ee6 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Thu, 10 Aug 2017 12:17:00 +0800
Subject: [PATCH 01/12] [SPARK-21687][SQL] Spark SQL should set createTime for Hive partition

---
 .../scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index bde9a81c65a4e..be936f428140a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -986,6 +986,7 @@ private[hive] object HiveClientImpl {
     tpart.setTableName(ht.getTableName)
     tpart.setValues(partValues.asJava)
     tpart.setSd(storageDesc)
+    tpart.setCreateTime((System.currentTimeMillis() / 1000).toInt)
     new HivePartition(ht, tpart)
   }

From f668ce8837ee553c61687bd03d04cddd32e5f36f Mon Sep 17 00:00:00 2001
From: debugger87
Date: Fri, 11 Aug 2017 15:50:26 +0800
Subject: [PATCH 02/12] added createTime and lastAccessTime into CatalogTablePartition

---
 .../apache/spark/sql/catalyst/catalog/interface.scala | 2 ++
 .../apache/spark/sql/hive/client/HiveClientImpl.scala | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index f86510624aa78..6ba7c851103f3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -97,6 +97,8 @@ object CatalogStorageFormat {
 case class CatalogTablePartition(
     spec: CatalogTypes.TablePartitionSpec,
     storage: CatalogStorageFormat,
+    createTime: Long = System.currentTimeMillis,
+    lastAccessTime: Long = -1,
     parameters: Map[String, String] = Map.empty) {

   def toLinkedHashMap: mutable.LinkedHashMap[String, String] = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index be936f428140a..a7e15f0e43f96 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -986,7 +986,8 @@ private[hive] object HiveClientImpl {
     tpart.setTableName(ht.getTableName)
     tpart.setValues(partValues.asJava)
     tpart.setSd(storageDesc)
-    tpart.setCreateTime((System.currentTimeMillis() / 1000).toInt)
+    tpart.setCreateTime((p.createTime / 1000).toInt)
+    tpart.setLastAccessTime((p.lastAccessTime / 1000).toInt)
     new HivePartition(ht, tpart)
   }

@@ -1005,8 +1006,10 @@ private[hive] object HiveClientImpl {
         compressed = apiPartition.getSd.isCompressed,
         properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
           .map(_.asScala.toMap).orNull),
-      parameters =
-        if (hp.getParameters() != null) hp.getParameters().asScala.toMap else Map.empty)
+      createTime = apiPartition.getCreateTime * 1000,
+      lastAccessTime = apiPartition.getLastAccessTime * 1000,
+      parameters =
+        if (hp.getParameters() != null) hp.getParameters().asScala.toMap else Map.empty)
   }

 // Below is the key of table properties for storing Hive-generated statistics
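Note: patches 01 and 02 fix a unit boundary: CatalogTablePartition keeps createTime/lastAccessTime as epoch milliseconds in a Long, while Hive's Thrift Partition stores them as epoch seconds in an Int, so every crossing divides or multiplies by 1000. A minimal, self-contained sketch of that round trip (the object and helper names are illustrative, not part of the patch):

    // Sketch only: mirrors the inline conversions done in the patch.
    object PartitionTimeConversions {
      def toHiveSeconds(catalogMillis: Long): Int = (catalogMillis / 1000).toInt
      def toCatalogMillis(hiveSeconds: Int): Long = hiveSeconds.toLong * 1000L

      def main(args: Array[String]): Unit = {
        val createTime = System.currentTimeMillis()      // e.g. 1502323020123
        val stored = toHiveSeconds(createTime)           // e.g. 1502323020
        val restored = toCatalogMillis(stored)           // e.g. 1502323020000
        assert(math.abs(createTime - restored) < 1000)   // sub-second precision is lost
      }
    }

The precision loss is inherent to the metastore representation, which only keeps whole seconds.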
From 2fb1ddabdb2ab8f7b585ee7aea93280f96a23467 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Fri, 11 Aug 2017 15:54:26 +0800
Subject: [PATCH 03/12] minor tweak

---
 .../org/apache/spark/sql/hive/client/HiveClientImpl.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index a7e15f0e43f96..11bf77b2ff9ff 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1006,8 +1006,8 @@ private[hive] object HiveClientImpl {
         compressed = apiPartition.getSd.isCompressed,
         properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
           .map(_.asScala.toMap).orNull),
-      createTime = apiPartition.getCreateTime * 1000,
-      lastAccessTime = apiPartition.getLastAccessTime * 1000,
+      createTime = apiPartition.getCreateTime.toLong * 1000,
+      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
       parameters =
        if (hp.getParameters() != null) hp.getParameters().asScala.toMap else Map.empty)
   }

From c833ce7aa5f2ba0b684494fd1b24b7995f1c09c9 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Fri, 11 Aug 2017 16:07:57 +0800
Subject: [PATCH 04/12] fix type mismatch

---
 .../org/apache/spark/sql/catalyst/catalog/interface.scala | 4 ++--
 .../org/apache/spark/sql/hive/client/HiveClientImpl.scala | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 6ba7c851103f3..3e8c4543ee742 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -97,9 +97,9 @@ object CatalogStorageFormat {
 case class CatalogTablePartition(
     spec: CatalogTypes.TablePartitionSpec,
     storage: CatalogStorageFormat,
+    parameters: Map[String, String] = Map.empty,
     createTime: Long = System.currentTimeMillis,
-    lastAccessTime: Long = -1,
-    parameters: Map[String, String] = Map.empty) {
+    lastAccessTime: Long = -1) {

   def toLinkedHashMap: mutable.LinkedHashMap[String, String] = {
     val map = new mutable.LinkedHashMap[String, String]()
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 11bf77b2ff9ff..5423e96d49e7e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1006,10 +1006,10 @@ private[hive] object HiveClientImpl {
         compressed = apiPartition.getSd.isCompressed,
         properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
           .map(_.asScala.toMap).orNull),
-      createTime = apiPartition.getCreateTime.toLong * 1000,
-      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
       parameters =
-        if (hp.getParameters() != null) hp.getParameters().asScala.toMap else Map.empty)
+        if (hp.getParameters() != null) hp.getParameters().asScala.toMap else Map.empty,
+      createTime = apiPartition.getCreateTime.toLong * 1000,
+      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000)
   }

 // Below is the key of table properties for storing Hive-generated statistics
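Note: the `.toLong` added in patch 03 ("minor tweak") is not cosmetic: `Partition.getCreateTime` returns an Int of epoch seconds, and in Scala `Int * Int` overflows before the result is widened to Long. A small sketch of the difference (the sample value is illustrative):

    // Sketch only: the widening has to happen before the multiplication.
    object OverflowDemo extends App {
      val createTimeSeconds: Int = 1502323020            // an epoch-seconds value from 2017

      val wrong: Long = createTimeSeconds * 1000          // Int math wraps around, then widens
      val right: Long = createTimeSeconds.toLong * 1000   // widen first, then multiply

      println(wrong)   // a negative, wrapped-around value
      println(right)   // 1502323020000
    }

Patch 04 then moves the two new fields after `parameters` in the case class; see the note following patch 08 below.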
From bf2a1052f807a7ae36004c819e66fff5c4b45820 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Sat, 12 Aug 2017 07:26:29 +0800
Subject: [PATCH 05/12] added createTime and lastAccessTime into partition map for display

---
 .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 3e8c4543ee742..9d99b70532263 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -109,6 +109,8 @@ case class CatalogTablePartition(
     if (parameters.nonEmpty) {
       map.put("Partition Parameters", s"{${parameters.map(p => p._1 + "=" + p._2).mkString(", ")}}")
     }
+    map.put("Created Time", new Date(createTime).toString)
+    map.put("Last Access", new Date(lastAccessTime).toString)
     map
   }

From a00e943a7097f386c842fd725cb1474e3a7f74c8 Mon Sep 17 00:00:00 2001
From: Chaozhong Yang
Date: Thu, 7 Jun 2018 11:42:52 +0800
Subject: [PATCH 06/12] bug fix

---
 .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 8761b8046acad..d4e585cc53c5e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -100,7 +100,7 @@ case class CatalogTablePartition(
     storage: CatalogStorageFormat,
     parameters: Map[String, String] = Map.empty,
     createTime: Long = System.currentTimeMillis,
-    lastAccessTime: Long = -1) {
+    lastAccessTime: Long = -1, stats: Option[CatalogStatistics] = None) {

   def toLinkedHashMap: mutable.LinkedHashMap[String, String] = {
     val map = new mutable.LinkedHashMap[String, String]()
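Note: patch 05 surfaces the new fields in DESCRIBE output by formatting the millisecond values with java.util.Date, and patch 06 adds the optional `stats` field to the parameter list. A self-contained sketch of the rendering (the surrounding object is illustrative; the two map keys match the patch):

    import java.util.Date
    import scala.collection.mutable

    // Sketch only: how the new "Created Time" / "Last Access" rows are rendered.
    object PartitionDisplayDemo extends App {
      val createTime: Long = System.currentTimeMillis()
      val lastAccessTime: Long = -1L   // the default used by the patch

      val map = new mutable.LinkedHashMap[String, String]()
      map.put("Created Time", new Date(createTime).toString)
      map.put("Last Access", new Date(lastAccessTime).toString)
      map.foreach { case (k, v) => println(s"$k\t$v") }
    }

Because these values differ from run to run (and by timezone), the golden files regenerated in patch 11 mask them as "[not included in comparison]".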
From e3a0cc43b10828b8111f7cd9523391cd3a2fdb6f Mon Sep 17 00:00:00 2001
From: Chaozhong Yang
Date: Thu, 7 Jun 2018 11:57:06 +0800
Subject: [PATCH 07/12] added some comments

---
 .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index d4e585cc53c5e..46407a94d2262 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -93,6 +93,8 @@ object CatalogStorageFormat {
  * @param spec partition spec values indexed by column name
  * @param storage storage format of the partition
  * @param parameters some parameters for the partition
+ * @param createTime creation time of the partition
+ * @param lastAccessTime last access time
  * @param stats optional statistics (number of rows, total size, etc.)
  */
 case class CatalogTablePartition(

From b0846c39a94d729ec0324cc72b98861da7c073c7 Mon Sep 17 00:00:00 2001
From: Chaozhong Yang
Date: Thu, 7 Jun 2018 20:40:10 +0800
Subject: [PATCH 08/12] fix missing comma

---
 .../scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 60aea7716cf35..1df46d7431a21 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1022,7 +1022,7 @@ private[hive] object HiveClientImpl {
         properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
           .map(_.asScala.toMap).orNull),
       createTime = apiPartition.getCreateTime.toLong * 1000,
-      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000)
+      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
       parameters = properties,
       stats = readHiveStats(properties))
   }
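Note: patch 04 earlier reordered the constructor so that `parameters` stays in front of the new fields, and patch 08 repairs the call site in `fromHivePartition`, where a closing parenthesis had cut off the `parameters` and `stats` arguments. The declaration order only matters for positional call sites; a trimmed-down sketch of that point (the case class here is illustrative, not the real catalog type):

    // Sketch only: appending defaulted fields keeps positional callers compiling.
    case class PartitionLike(
        spec: Map[String, String],
        storage: String,
        parameters: Map[String, String] = Map.empty,  // kept ahead of the new fields
        createTime: Long = System.currentTimeMillis,
        lastAccessTime: Long = -1)

    object CallSiteDemo extends App {
      // A pre-existing positional call still compiles because the new fields
      // come after `parameters` and carry defaults:
      val p1 = PartitionLike(Map("ds" -> "2017-08-01"), "parquet", Map("k" -> "v"))

      // Named arguments are insensitive to declaration order either way:
      val p2 = PartitionLike(
        spec = Map("ds" -> "2017-08-01"),
        storage = "parquet",
        createTime = 1502323020000L)

      println(p1.createTime > 0 && p2.lastAccessTime == -1)   // true
    }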
From 0390e88e7e16a5200041978073f263994decc6f6 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Fri, 22 Jun 2018 21:33:22 +0800
Subject: [PATCH 09/12] ignore createTime and lastAccessTime in catalogPartitionsEqual

---
 .../spark/sql/catalyst/catalog/SessionCatalogSuite.scala | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
index 6abab0073cca3..6a7375ee186fa 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
@@ -1114,11 +1114,13 @@ abstract class SessionCatalogSuite extends AnalysisTest {
     // And for hive serde table, hive metastore will set some values(e.g.transient_lastDdlTime)
     // in table's parameters and storage's properties, here we also ignore them.
     val actualPartsNormalize = actualParts.map(p =>
-      p.copy(parameters = Map.empty, storage = p.storage.copy(
+      p.copy(parameters = Map.empty, createTime = -1, lastAccessTime = -1,
+        storage = p.storage.copy(
         properties = Map.empty, locationUri = None, serde = None))).toSet

     val expectedPartsNormalize = expectedParts.map(p =>
-      p.copy(parameters = Map.empty, storage = p.storage.copy(
+      p.copy(parameters = Map.empty, createTime = -1, lastAccessTime = -1,
+        storage = p.storage.copy(
         properties = Map.empty, locationUri = None, serde = None))).toSet

     actualPartsNormalize == expectedPartsNormalize

From c843ef18c4073281c8e8e59abf17a7bcbc6c5560 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Sat, 23 Jun 2018 01:14:10 +0800
Subject: [PATCH 10/12] fix sql output

---
 .../scala/org/apache/spark/sql/catalyst/catalog/interface.scala | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 46407a94d2262..70cea836885c9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -113,8 +113,6 @@ case class CatalogTablePartition(
     if (parameters.nonEmpty) {
       map.put("Partition Parameters", s"{${parameters.map(p => p._1 + "=" + p._2).mkString(", ")}}")
     }
-    map.put("Created Time", new Date(createTime).toString)
-    map.put("Last Access", new Date(lastAccessTime).toString)
     stats.foreach(s => map.put("Partition Statistics", s.simpleString))
     map
   }
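Note: patch 09 keeps the SessionCatalogSuite comparison deterministic by blanking out fields the metastore fills in on its own before comparing partition sets, and patch 10 briefly drops the display lines (they come back in patch 11). A standalone sketch of the same normalization idea (the case class and values are illustrative):

    // Sketch only: normalize run-dependent fields before comparing partitions.
    case class PartitionInfo(
        spec: Map[String, String],
        parameters: Map[String, String] = Map.empty,
        createTime: Long = System.currentTimeMillis,
        lastAccessTime: Long = -1)

    object NormalizeDemo extends App {
      def normalize(parts: Seq[PartitionInfo]): Set[PartitionInfo] =
        parts.map(_.copy(parameters = Map.empty, createTime = -1, lastAccessTime = -1)).toSet

      val expected = Seq(PartitionInfo(Map("ds" -> "2017-08-01")))
      val actual = Seq(PartitionInfo(
        Map("ds" -> "2017-08-01"),
        parameters = Map("transient_lastDdlTime" -> "1530000000")))

      // They differ only in fields the metastore populates, so the normalized sets match.
      println(normalize(actual) == normalize(expected))   // true
    }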
From 715c7ccee05cf85a4e6a771dabd0799a48b559a3 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Wed, 27 Jun 2018 14:59:00 +0800
Subject: [PATCH 11/12] regenerate output files for SQLQueryTestSuite after adding Created Time and Last Access to CatalogTablePartition.toLinkedHashMap

---
 .../spark/sql/catalyst/catalog/interface.scala     |  2 ++
 .../results/describe-part-after-analyze.sql.out    | 14 ++++++++++++++
 .../resources/sql-tests/results/describe.sql.out   |  4 ++++
 .../sql-tests/results/show-tables.sql.out          |  2 ++
 4 files changed, 22 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 70cea836885c9..46407a94d2262 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -113,6 +113,8 @@ case class CatalogTablePartition(
     if (parameters.nonEmpty) {
       map.put("Partition Parameters", s"{${parameters.map(p => p._1 + "=" + p._2).mkString(", ")}}")
     }
+    map.put("Created Time", new Date(createTime).toString)
+    map.put("Last Access", new Date(lastAccessTime).toString)
     stats.foreach(s => map.put("Partition Statistics", s.simpleString))
     map
   }
diff --git a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
index 58ed201e2a60f..8ba69c698b551 100644
--- a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out
@@ -57,6 +57,8 @@ Database default
 Table t
 Partition Values [ds=2017-08-01, hr=10]
 Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10
+Created Time [not included in comparison]
+Last Access [not included in comparison]

 # Storage Information
 Location [not included in comparison]sql/core/spark-warehouse/t
@@ -89,6 +91,8 @@ Database default
 Table t
 Partition Values [ds=2017-08-01, hr=10]
 Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics 1121 bytes, 3 rows

 # Storage Information
@@ -122,6 +126,8 @@ Database default
 Table t
 Partition Values [ds=2017-08-01, hr=10]
 Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics 1121 bytes, 3 rows

 # Storage Information
@@ -147,6 +153,8 @@ Database default
 Table t
 Partition Values [ds=2017-08-01, hr=11]
 Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics 1098 bytes, 4 rows

 # Storage Information
@@ -180,6 +188,8 @@ Database default
 Table t
 Partition Values [ds=2017-08-01, hr=10]
 Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics 1121 bytes, 3 rows

 # Storage Information
@@ -205,6 +215,8 @@ Database default
 Table t
 Partition Values [ds=2017-08-01, hr=11]
 Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics 1098 bytes, 4 rows

 # Storage Information
@@ -230,6 +242,8 @@ Database default
 Table t
 Partition Values [ds=2017-09-01, hr=5]
 Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-09-01/hr=5
+Created Time [not included in comparison]
+Last Access [not included in comparison]
 Partition Statistics 1144 bytes, 2 rows

 # Storage Information
diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
index 8c908b7625056..79390cb424444 100644
--- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
@@ -282,6 +282,8 @@ Table t
 Partition Values [c=Us, d=1]
 Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1
 Storage Properties [a=1, b=2]
+Created Time [not included in comparison]
+Last Access [not included in comparison]

 # Storage Information
 Num Buckets 2
@@ -311,6 +313,8 @@ Table t
 Partition Values [c=Us, d=1]
 Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1
 Storage Properties [a=1, b=2]
+Created Time [not included in comparison]
+Last Access [not included in comparison]

 # Storage Information
 Num Buckets 2
diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out
index 975bb06124744..abeb7e18f031e 100644
--- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out
@@ -178,6 +178,8 @@ struct
 -- !query 14 output
 showdb show_t1 false Partition Values: [c=Us, d=1]
 Location [not included in comparison]sql/core/spark-warehouse/showdb.db/show_t1/c=Us/d=1
+Created Time [not included in comparison]
+Last Access [not included in comparison]


 -- !query 15
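Note: the regenerated golden files above only work because the test harness normalizes run-dependent output to the "[not included in comparison]" placeholder before diffing. The sketch below shows the general technique with a hypothetical normalizer; it is not the actual SQLQueryTestSuite code, and the names and masked prefixes are assumptions for illustration:

    // Hypothetical sketch: mask non-deterministic DESCRIBE lines before comparison.
    object GoldenFileMask {
      private val maskedPrefixes = Seq("Created Time", "Last Access")

      def normalize(output: String): String =
        output.split("\n").map { line =>
          maskedPrefixes.find(p => line.startsWith(p)) match {
            case Some(prefix) => s"$prefix [not included in comparison]"
            case None => line
          }
        }.mkString("\n")

      def main(args: Array[String]): Unit = {
        val raw = "Created Time Thu Aug 10 12:17:00 CST 2017\nTable t"
        println(normalize(raw))
        // Created Time [not included in comparison]
        // Table t
      }
    }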
From 18c85b61139e2b9d434214b9082b43a46e1c8787 Mon Sep 17 00:00:00 2001
From: debugger87
Date: Wed, 27 Jun 2018 16:18:03 +0800
Subject: [PATCH 12/12] fix comment

---
 .../org/apache/spark/sql/catalyst/catalog/interface.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 46407a94d2262..c6105c5526049 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -93,8 +93,8 @@ object CatalogStorageFormat {
  * @param spec partition spec values indexed by column name
  * @param storage storage format of the partition
  * @param parameters some parameters for the partition
- * @param createTime creation time of the partition
- * @param lastAccessTime last access time
+ * @param createTime creation time of the partition, in milliseconds
+ * @param lastAccessTime last access time, in milliseconds
  * @param stats optional statistics (number of rows, total size, etc.)
  */
 case class CatalogTablePartition(
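Note: patch 12 pins down the unit contract in the scaladoc: the catalog stores milliseconds while Hive stores seconds. If these conversions ever moved into shared helpers, java.util.concurrent.TimeUnit would state that intent without bare `* 1000` constants; a sketch under that assumption (the helper names are illustrative, not part of the patch):

    import java.util.concurrent.TimeUnit

    // Sketch only: an alternative spelling of the seconds/milliseconds conversions.
    object PartitionTimeUnits extends App {
      def toHiveSeconds(catalogMillis: Long): Int =
        TimeUnit.MILLISECONDS.toSeconds(catalogMillis).toInt

      def toCatalogMillis(hiveSeconds: Int): Long =
        TimeUnit.SECONDS.toMillis(hiveSeconds.toLong)

      val now = System.currentTimeMillis()
      println(toHiveSeconds(now))                    // epoch seconds
      println(toCatalogMillis(toHiveSeconds(now)))   // back to milliseconds, second precision
    }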