diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index 738231cc13eb3..3ac9b8d8e42ef 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -26,16 +26,16 @@ import org.apache.spark.sql.catalyst.trees
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
   self: Product =>
 
-  protected class Estimates {
-    lazy val childrenEstimations = children.map(_.estimates)
-    lazy val cardinality: Long = childrenEstimations.map(_.cardinality).sum
-    lazy val sizeInBytes: Long = childrenEstimations.map(_.sizeInBytes).sum
+  protected class Statistics {
+    lazy val childrenStats = children.map(_.statistics)
+    lazy val numTuples: Long = childrenStats.map(_.numTuples).sum
+    lazy val sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
   }
 
   /**
    * Estimates of various statistics.
    */
-  lazy val estimates: Estimates = new Estimates
+  lazy val statistics: Statistics = new Statistics
 
   /**
    * Returns the set of attributes that are referenced by this node
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index 58cc3f92cf851..bafc5aaff3f03 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -69,7 +69,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
              condition,
              left,
              right)
-          if right.estimates.sizeInBytes <= sqlContext.autoConvertJoinSize =>
+          if right.statistics.sizeInBytes <= sqlContext.autoConvertJoinSize =>
         broadcastHashJoin(leftKeys, rightKeys, left, right, condition, BuildRight)
 
       case ExtractEquiJoinKeys(
@@ -79,7 +79,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
              condition,
              left,
              right)
-          if left.estimates.sizeInBytes <= sqlContext.autoConvertJoinSize =>
+          if left.statistics.sizeInBytes <= sqlContext.autoConvertJoinSize =>
         broadcastHashJoin(leftKeys, rightKeys, left, right, condition, BuildLeft)
 
       case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right) =>
@@ -271,8 +271,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         execution.Limit(limit, planLater(child))(sqlContext) :: Nil
       case Unions(unionChildren) =>
         execution.Union(unionChildren.map(planLater))(sqlContext) :: Nil
-      case logical.Except(left,right) =>
-        execution.Except(planLater(left),planLater(right)) :: Nil
+      case logical.Except(left,right) =>
+        execution.Except(planLater(left),planLater(right)) :: Nil
       case logical.Intersect(left, right) =>
         execution.Intersect(planLater(left), planLater(right)) :: Nil
       case logical.Generate(generator, join, outer, _, child) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
index 6e24bc951d612..f70aff4fef698 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -53,7 +53,7 @@ private[sql] case class ParquetRelation(
 
   self: Product =>
 
-  @transient override lazy val estimates = new Estimates {
+  @transient override lazy val statistics = new Statistics {
     // TODO: investigate getting encoded column statistics in the parquet file?
     override lazy val sizeInBytes: Long = {
       val hdfsPath = new Path(path)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 246f804bb5d36..a504405701dd3 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -270,7 +270,7 @@ private[hive] case class MetastoreRelation
   }
 
   // TODO: are there any stats in hiveQlTable.getSkewedInfo that we can use?
-  @transient override lazy val estimates = new Estimates {
+  @transient override lazy val statistics = new Statistics {
     // TODO: check if this estimate is valid for tables after partition pruning.
     // Size getters adapted from SizeBasedBigTableSelectorForAutoSMJ.java in Hive (version 0.13).
     override lazy val sizeInBytes: Long =
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/EstimatesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
similarity index 95%
rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/EstimatesSuite.scala
rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 7e86aba6f3cee..c004eb7a588f7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/EstimatesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -25,14 +25,14 @@ import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.parquet.{ParquetRelation, ParquetTestData}
 import org.apache.spark.util.Utils
 
-class EstimatesSuite extends QueryTest {
+class StatisticsSuite extends QueryTest {
 
   test("estimates the size of a test ParquetRelation") {
     ParquetTestData.writeFile()
     val testRDD = parquetFile(ParquetTestData.testDir.toString)
 
     val sizes = testRDD.logicalPlan.collect { case j: ParquetRelation =>
-      (j.estimates.sizeInBytes, j.newInstance.estimates.sizeInBytes)
+      (j.statistics.sizeInBytes, j.newInstance.statistics.sizeInBytes)
     }
     assert(sizes.size === 1)
     assert(sizes(0)._1 == sizes(0)._2, "after .newInstance, estimates are different from before")
@@ -44,7 +44,7 @@ class EstimatesSuite extends QueryTest {
   test("estimates the size of a test MetastoreRelation") {
     val rdd = hql("""SELECT * FROM src""")
     val sizes = rdd.queryExecution.analyzed.collect { case mr: MetastoreRelation =>
-      mr.estimates.sizeInBytes
+      mr.statistics.sizeInBytes
     }
     assert(sizes.size === 1 && sizes(0) > 0)
   }
@@ -63,7 +63,7 @@ class EstimatesSuite extends QueryTest {
     // Assert src has a size smaller than the threshold.
     val sizes = rdd.queryExecution.analyzed.collect {
       case r if ct.runtimeClass.isAssignableFrom(r.getClass) =>
-        r.estimates.sizeInBytes
+        r.statistics.sizeInBytes
     }
     assert(sizes.size === 2 && sizes(0) <= autoConvertJoinSize,
       s"query should contain two relations, each of which has size smaller than autoConvertSize")
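
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a minimal, standalone model of
// the pattern the rename settles on. Each plan node exposes a `statistics`
// object whose fields default to aggregates over the node's children, and
// leaf relations override `sizeInBytes` with a source-specific estimate, the
// way ParquetRelation and MetastoreRelation do above. `PlanNode`, `Scan`,
// `Join`, and `StatisticsDemo` are hypothetical stand-ins for the Spark
// classes, simplified to compile on their own (no `protected`, no Catalyst
// dependencies).
// ---------------------------------------------------------------------------
abstract class PlanNode {
  def children: Seq[PlanNode]

  // Mirrors LogicalPlan.Statistics: by default, sum up the children's stats.
  class Statistics {
    lazy val childrenStats = children.map(_.statistics)
    lazy val numTuples: Long = childrenStats.map(_.numTuples).sum
    lazy val sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
  }

  lazy val statistics: Statistics = new Statistics
}

// A leaf overrides sizeInBytes, as ParquetRelation does from HDFS file sizes.
case class Scan(estimatedBytes: Long) extends PlanNode {
  def children: Seq[PlanNode] = Nil
  override lazy val statistics: Statistics = new Statistics {
    override lazy val sizeInBytes: Long = estimatedBytes
  }
}

// An inner node keeps the default child-summing Statistics.
case class Join(left: PlanNode, right: PlanNode) extends PlanNode {
  def children: Seq[PlanNode] = Seq(left, right)
}

object StatisticsDemo extends App {
  val autoConvertJoinSize = 10L * 1024 * 1024 // hypothetical threshold value
  val join = Join(Scan(estimatedBytes = 1L << 30), Scan(estimatedBytes = 4096))
  // The same guard SparkStrategies applies when picking a broadcast hash join:
  val buildRight = join.right.statistics.sizeInBytes <= autoConvertJoinSize
  println(s"join size = ${join.statistics.sizeInBytes} bytes, broadcast right = $buildRight")
}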