apache · lianhuiwang · Jun 5, 2017 · Jun 6, 2017 · Jun 11, 2017 · Jun 12, 2017
diff --git a/...src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/...src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.spark.sql.catalyst.catalog.CatalogStatistics
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
@@ -59,8 +60,11 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
         val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
         val prunedFsRelation =
           fsRelation.copy(location = prunedFileIndex)(sparkSession)
-        val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
-
+        // Change table stats based on the sizeInBytes of pruned files
+        val withStats = logicalRelation.catalogTable.map(_.copy(
+          stats = Some(CatalogStatistics(sizeInBytes = BigInt(prunedFileIndex.sizeInBytes)))))
+        val prunedLogicalRelation = logicalRelation.copy(
+          relation = prunedFsRelation, catalogTable = withStats)
         // Keep partition-pruning predicates so that they are visible in physical planning
         val filterExpression = filters.reduceLeft(And)
         val filter = Filter(filterExpression, prunedLogicalRelation)

diff --git a/...e/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/...e/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
@@ -18,13 +18,15 @@
 package org.apache.spark.sql.hive.execution
 
 import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.StructType
 
@@ -66,4 +68,42 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te
       }
     }
   }
+
+  test("SPARK-20986 Reset table's statistics after PruneFileSourcePartitions rule") {
+    withTempView("tempTbl") {
+      withTable("partTbl") {
+        spark.range(1000).selectExpr("id").createOrReplaceTempView("tempTbl")
+        sql("CREATE TABLE partTbl (id INT) PARTITIONED BY (part INT) STORED AS parquet")
+        for (part <- Seq(1, 2, 3)) {
+          sql(
+            s"""
+               |INSERT OVERWRITE TABLE partTbl PARTITION (part='$part')
+               |select id from tempTbl
+            """.stripMargin)
+        }
+        // Compute table-level statistics so the catalog holds a baseline size to compare with.
+        val tableName = "partTbl"
+        sql(s"analyze table $tableName compute STATISTICS")
+        // Read the statistics back from the catalog and make sure they were actually stored.
+        val tableStats =
+          spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)).stats
+        assert(tableStats.isDefined && tableStats.get.sizeInBytes > 0, "tableStats is lost")
+        // The unoptimized relation reports the full table size; the pruned one must be smaller.
+        withSQLConf(SQLConf.ENABLE_FALL_BACK_TO_HDFS_FOR_STATS.key -> "true") {
+          val df = sql("SELECT * FROM partTbl where part = 1")
+          val query = df.queryExecution.analyzed.analyze
+          val sizes1 = query.collect {
+            case relation: LogicalRelation => relation.computeStats(conf).sizeInBytes
+          }
+          assert(sizes1.size === 1, s"Size wrong for:\n ${df.queryExecution}")
+          assert(sizes1(0) == tableStats.get.sizeInBytes)
+          val sizes2 = Optimize.execute(query).collect {
+            case relation: LogicalRelation => relation.computeStats(conf).sizeInBytes
+          }
+          assert(sizes2.size === 1, s"Size wrong for:\n ${df.queryExecution}")
+          assert(sizes2(0) < tableStats.get.sizeInBytes)
+        }
+      }
+    }
+  }
 }