From 120662e4e2834cf3c191357a29da56b81b7b9c05 Mon Sep 17 00:00:00 2001
From: lianhuiwang
Date: Sun, 11 Jun 2017 16:46:07 +0800
Subject: [PATCH] address comments.

---
 .../datasources/PruneFileSourcePartitions.scala    |  1 +
 .../execution/PruneFileSourcePartitionsSuite.scala | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
index 6dbb46ec37585..f5df1848a38c4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -60,6 +60,7 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
         val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
         val prunedFsRelation =
           fsRelation.copy(location = prunedFileIndex)(sparkSession)
+        // Change table stats based on the sizeInBytes of pruned files
         val withStats = logicalRelation.catalogTable.map(_.copy(
           stats = Some(CatalogStatistics(sizeInBytes = BigInt(prunedFileIndex.sizeInBytes)))))
         val prunedLogicalRelation = logicalRelation.copy(
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
index 3e31231e6beb7..4f2ecf228d26a 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.hive.execution
 
 import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
@@ -81,6 +82,13 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te
             """.stripMargin)
         }
 
+        val tableName = "partTbl"
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
+
+        val tableStats =
+          spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats
+        assert(tableStats.isDefined && tableStats.get.sizeInBytes > 0, "table stats are missing")
+
         withSQLConf(SQLConf.ENABLE_FALL_BACK_TO_HDFS_FOR_STATS.key -> "true") {
           val df = sql("SELECT * FROM partTbl where part = 1")
           val query = df.queryExecution.analyzed.analyze
@@ -88,12 +96,12 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te
             case relation: LogicalRelation => relation.computeStats(conf).sizeInBytes
           }
           assert(sizes1.size === 1, s"Size wrong for:\n ${df.queryExecution}")
-          assert(sizes1(0) > 5000, s"expected > 5000 for test table 'src', got: ${sizes1(0)}")
+          assert(sizes1(0) == tableStats.get.sizeInBytes)
           val sizes2 = Optimize.execute(query).collect {
             case relation: LogicalRelation => relation.computeStats(conf).sizeInBytes
           }
           assert(sizes2.size === 1, s"Size wrong for:\n ${df.queryExecution}")
-          assert(sizes2(0) < 5000, s"expected < 5000 for test table 'src', got: ${sizes2(0)}")
+          assert(sizes2(0) < tableStats.get.sizeInBytes)
         }
       }
     }