diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index ba48facff2933..918459fe7c246 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -681,9 +681,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
 
       }
     }
 
-    // construct Spark's statistics from information in Hive metastore
+    // Restore Spark's statistics from information in the metastore.
     val statsProps = table.properties.filterKeys(_.startsWith(STATISTICS_PREFIX))
 
+    // Currently we have two sources of statistics: one from Hive and the other from Spark.
+    // In our design, if Spark's statistics are available, we respect them over Hive's statistics.
     if (statsProps.nonEmpty) {
       val colStats = new mutable.HashMap[String, ColumnStat]
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index b970be740ab51..be024adac8eb0 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -434,6 +434,8 @@ private[hive] class HiveClientImpl(
     }
 
     val comment = properties.get("comment")
+    // Here we are reading statistics from Hive.
+    // Note that these statistics may be overridden by Spark's statistics if they are available.
     val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_))
     val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_))
     val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)).filter(_ >= 0)