
[SPARK-22673][SQL] InMemoryRelation should utilize existing stats whenever possible #19864

Closed · wants to merge 21 commits
@@ -94,14 +94,16 @@ class CacheManager extends Logging {
       logWarning("Asked to cache already cached data.")
     } else {
       val sparkSession = query.sparkSession
-      cachedData.add(CachedData(
-        planToCache,
-        InMemoryRelation(
-          sparkSession.sessionState.conf.useCompression,
-          sparkSession.sessionState.conf.columnBatchSize,
-          storageLevel,
-          sparkSession.sessionState.executePlan(planToCache).executedPlan,
-          tableName)))
+      val inMemoryRelation = InMemoryRelation(
+        sparkSession.sessionState.conf.useCompression,
+        sparkSession.sessionState.conf.columnBatchSize,
+        storageLevel,
+        sparkSession.sessionState.executePlan(planToCache).executedPlan,
+        tableName)
+      if (planToCache.conf.cboEnabled && planToCache.stats.rowCount.isDefined) {
Member: Do we need to limit this to those conditions? I think we can pass the stats into the created InMemoryRelation even if the two conditions don't match.

Contributor Author: The reason I put it here is that when CBO is not enabled, the stats in the underlying plan might be much smaller than the actual size in memory, with the potential risk of an OOM error.

The underlying cause is that without CBO enabled, the size of the plan is calculated from BaseRelation's sizeInBytes, but with CBO we can have a more accurate estimation:

override def computeStats(): Statistics = {
  catalogTable
    .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled)))
    .getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))
}

def toPlanStats(planOutput: Seq[Attribute], cboEnabled: Boolean): Statistics = {
  if (cboEnabled && rowCount.isDefined) {
    val attrStats = AttributeMap(planOutput.flatMap(a => colStats.get(a.name).map(a -> _)))
    // Estimate size as number of rows * row size.
    val size = EstimationUtils.getOutputSize(planOutput, rowCount.get, attrStats)
    Statistics(sizeInBytes = size, rowCount = rowCount, attributeStats = attrStats)
  } else {
    // When CBO is disabled or the table doesn't have other statistics, we apply the size-only
    // estimation strategy and only propagate sizeInBytes in statistics.
    Statistics(sizeInBytes = sizeInBytes)
  }
}
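The two estimation strategies above can be illustrated with a small, self-contained sketch (`PlanStats` and `estimateSize` are made-up names standing in for Spark's `Statistics` and `toPlanStats`, not its actual API): the CBO path multiplies the row count by an estimated row width, while the size-only path just propagates the raw file size.

```scala
// Illustrative sketch only: PlanStats and estimateSize are hypothetical
// stand-ins for Spark's Statistics and CatalogStatistics.toPlanStats.
case class PlanStats(sizeInBytes: BigInt, rowCount: Option[BigInt] = None)

def estimateSize(
    fileSizeInBytes: BigInt,
    rowCount: Option[BigInt],
    avgRowSizeInBytes: BigInt,
    cboEnabled: Boolean): PlanStats = {
  if (cboEnabled && rowCount.isDefined) {
    // CBO path: rows * estimated row width, typically much closer to the
    // in-memory footprint than the (often compressed) on-disk file size.
    PlanStats(rowCount.get * avgRowSizeInBytes, rowCount)
  } else {
    // Size-only path: propagate just the file size.
    PlanStats(fileSizeInBytes)
  }
}
```

With 10 rows at an estimated 32 bytes each, the CBO path reports 320 bytes regardless of how small the compressed file is, which is the gap the discussion above is about.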

Member (@viirya, Dec 6, 2017): When CBO is disabled, don't we just set sizeInBytes to defaultSizeInBytes? Is that different from the current statistics of the first run?

Contributor Author: No, if CBO is disabled, the relation's sizeInBytes is the file size:

override def sizeInBytes: Long = location.sizeInBytes

Member: LogicalRelation uses the statistics from the relation only when there is no given catalogTable; in that case, it doesn't consider whether CBO is enabled or not.

Only catalogTable considers CBO when computing its statistics in toPlanStats. It doesn't refer to the relation's statistics, IIUC.

Member: If a catalog table doesn't have statistics in its metadata, we fill it with defaultSizeInBytes:

val sizeInBytes = if (session.sessionState.conf.fallBackToHdfsForStatsEnabled) {
  try {
    val hadoopConf = session.sessionState.newHadoopConf()
    val tablePath = new Path(table.location)
    val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
    fs.getContentSummary(tablePath).getLength
  } catch {
    case e: IOException =>
      logWarning("Failed to get table size from hdfs.", e)
      session.sessionState.conf.defaultSizeInBytes
  }
} else {
  session.sessionState.conf.defaultSizeInBytes
}

Contributor Author: @viirya you're right! Thanks for clearing up the confusion.

However, to prevent using the relation's stats, which can be much smaller than the in-memory size and lead to a potential OOM error, we should still keep this condition here (though we can remove cboEnabled), right?

Member: The statistics from the relation are based on file size; will that easily cause an OOM issue? In cases other than cached queries, we still use the relation's statistics. If this is an issue, doesn't it also affect those other cases?

Contributor Author (@CodingCat, Dec 6, 2017): That's true, I believe it does affect them. There is a similar discussion in #19743.

+        inMemoryRelation.setStatsFromCachedPlan(planToCache)
+      }
Contributor Author: I had to make InMemoryRelation stateful to avoid breaking APIs.

Contributor: I think a more ideal change would be to pass the original plan's stats into the constructor of InMemoryRelation, instead of making it mutable.
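A minimal sketch of what that constructor-based alternative could look like (the class, field, and parameter names here are simplified stand-ins loosely mirroring the diff, not the actual patch):

```scala
// Illustrative sketch: pass the cached plan's stats through the constructor
// rather than a mutable setter. Statistics and InMemoryRelationSketch are
// hypothetical simplifications, not Spark's real classes.
case class Statistics(sizeInBytes: BigInt)

class InMemoryRelationSketch(
    defaultSizeInBytes: BigInt,
    statsOfPlanToCache: Option[Statistics]) { // fixed at construction time

  // Size accumulated while materializing the column batches; 0 until then.
  var batchStatsValue: Long = 0L

  def computeStats(): Statistics =
    if (batchStatsValue == 0L) {
      // Not materialized yet: prefer the cached plan's stats over the default.
      statsOfPlanToCache.getOrElse(Statistics(defaultSizeInBytes))
    } else {
      // Materialized: the accumulated batch size is the most accurate figure.
      Statistics(BigInt(batchStatsValue))
    }
}
```

This keeps the relation immutable: the caller decides at construction whether to hand over the plan's stats, and computeStats only ever reads them.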

Contributor Author: It looks like I have no way to access InMemoryRelation from outside of the spark package, though it is not a package-private class... how is that achieved?

If that's the case, I can modify the constructor.

Thanks @cloud-fan

Contributor: InMemoryRelation is not part of the public API and should be treated as unstable/internal; use it at your own risk. Changing the constructor is fine.

Member: +1

+      cachedData.add(CachedData(planToCache, inMemoryRelation))
     }
   }

@@ -25,13 +25,15 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical
-import org.apache.spark.sql.catalyst.plans.logical.Statistics
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
 import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.datasources.LogicalRelation
Member: Unused import.

import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.LongAccumulator


object InMemoryRelation {

Member: Unnecessary change.

def apply(
useCompression: Boolean,
batchSize: Int,
@@ -71,14 +73,20 @@ case class InMemoryRelation(

   override def computeStats(): Statistics = {
     if (batchStats.value == 0L) {
-      // Underlying columnar RDD hasn't been materialized, no useful statistics information
-      // available, return the default statistics.
Contributor: Suggested comment wording: // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache.

-      Statistics(sizeInBytes = child.sqlContext.conf.defaultSizeInBytes)
+      inheritedStats.getOrElse(Statistics(sizeInBytes = child.sqlContext.conf.defaultSizeInBytes))
} else {
Statistics(sizeInBytes = batchStats.value.longValue)
}
}

+  private var inheritedStats: Option[Statistics] = _
+
+  private[execution] def setStatsFromCachedPlan(planToCache: LogicalPlan): Unit = {
+    require(planToCache.conf.cboEnabled, "you cannot use the stats of cached plan in" +
+      " InMemoryRelation without cbo enabled")
+    inheritedStats = Some(planToCache.stats)
+  }

// If the cached column buffers were not passed in, we calculate them in the constructor.
// As in Spark, the actual work of caching is lazy.
if (_cachedColumnBuffers == null) {