Commit 7a60ab7: s/Estimates/Statistics, s/cardinality/numTuples.
concretevitamin committed Jul 29, 2014
1 parent de3ae13 commit 7a60ab7
Showing 5 changed files with 15 additions and 15 deletions.
LogicalPlan:

```diff
@@ -26,16 +26,16 @@ import org.apache.spark.sql.catalyst.trees
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
   self: Product =>
 
-  protected class Estimates {
-    lazy val childrenEstimations = children.map(_.estimates)
-    lazy val cardinality: Long = childrenEstimations.map(_.cardinality).sum
-    lazy val sizeInBytes: Long = childrenEstimations.map(_.sizeInBytes).sum
+  protected class Statistics {
+    lazy val childrenStats = children.map(_.statistics)
+    lazy val numTuples: Long = childrenStats.map(_.numTuples).sum
+    lazy val sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
   }
 
   /**
    * Estimates of various statistics.
    */
-  lazy val estimates: Estimates = new Estimates
+  lazy val statistics: Statistics = new Statistics
 
   /**
    * Returns the set of attributes that are referenced by this node
```
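The renamed class keeps the same bottom-up defaults: a node's numTuples and sizeInBytes are sums over its children, and leaf relations override them with concrete numbers. A minimal self-contained sketch of that pattern (ToyRelation and ToyUnion are illustrative stand-ins, not Catalyst classes):

```scala
// Sketch of the Statistics pattern above, with toy plan types.
abstract class Plan {
  def children: Seq[Plan]

  class Statistics {
    lazy val childrenStats = children.map(_.statistics)
    lazy val numTuples: Long = childrenStats.map(_.numTuples).sum
    lazy val sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
  }

  lazy val statistics: Statistics = new Statistics
}

// A leaf relation supplies concrete numbers.
case class ToyRelation(rows: Long, bytes: Long) extends Plan {
  def children: Seq[Plan] = Nil
  override lazy val statistics = new Statistics {
    override lazy val numTuples: Long = rows
    override lazy val sizeInBytes: Long = bytes
  }
}

// An inner node keeps the default: statistics aggregate over children.
case class ToyUnion(children: Seq[Plan]) extends Plan

object StatisticsDemo extends App {
  val plan = ToyUnion(Seq(ToyRelation(10, 100), ToyRelation(5, 50)))
  println(plan.statistics.numTuples)   // 15
  println(plan.statistics.sizeInBytes) // 150
}
```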
SparkStrategies:

```diff
@@ -69,7 +69,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
             condition,
             left,
             right)
-          if right.estimates.sizeInBytes <= sqlContext.autoConvertJoinSize =>
+          if right.statistics.sizeInBytes <= sqlContext.autoConvertJoinSize =>
         broadcastHashJoin(leftKeys, rightKeys, left, right, condition, BuildRight)
 
       case ExtractEquiJoinKeys(
@@ -79,7 +79,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
             condition,
             left,
             right)
-          if left.estimates.sizeInBytes <= sqlContext.autoConvertJoinSize =>
+          if left.statistics.sizeInBytes <= sqlContext.autoConvertJoinSize =>
         broadcastHashJoin(leftKeys, rightKeys, left, right, condition, BuildLeft)
 
       case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right) =>
@@ -271,8 +271,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         execution.Limit(limit, planLater(child))(sqlContext) :: Nil
       case Unions(unionChildren) =>
         execution.Union(unionChildren.map(planLater))(sqlContext) :: Nil
-      case logical.Except(left,right) =>
-        execution.Except(planLater(left),planLater(right)) :: Nil
+      case logical.Except(left,right) =>
+        execution.Except(planLater(left),planLater(right)) :: Nil
       case logical.Intersect(left, right) =>
         execution.Intersect(planLater(left), planLater(right)) :: Nil
       case logical.Generate(generator, join, outer, _, child) =>
```

(The Except lines differ only in whitespace.)
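The two guards above are where the renamed statistics feed planning: a join side whose estimated sizeInBytes fits under sqlContext.autoConvertJoinSize is broadcast, checking the right side first. A toy reduction of that decision (the threshold value and relation names are invented for the example):

```scala
// Toy reduction of the size-based join choice in SparkStrategies.
object JoinChoiceDemo extends App {
  case class Relation(name: String, sizeInBytes: Long)

  // Stand-in for sqlContext.autoConvertJoinSize (value invented here).
  val autoConvertJoinSize: Long = 10L * 1024 * 1024

  def chooseJoin(left: Relation, right: Relation): String =
    if (right.sizeInBytes <= autoConvertJoinSize)
      s"broadcastHashJoin, broadcasting ${right.name} (BuildRight)"
    else if (left.sizeInBytes <= autoConvertJoinSize)
      s"broadcastHashJoin, broadcasting ${left.name} (BuildLeft)"
    else
      "shuffled hash join"

  println(chooseJoin(Relation("fact", 1L << 34), Relation("dim", 4096)))
  // broadcastHashJoin, broadcasting dim (BuildRight)
}
```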
ParquetRelation:

```diff
@@ -53,7 +53,7 @@ private[sql] case class ParquetRelation(
 
   self: Product =>
 
-  @transient override lazy val estimates = new Estimates {
+  @transient override lazy val statistics = new Statistics {
     // TODO: investigate getting encoded column statistics in the parquet file?
     override lazy val sizeInBytes: Long = {
       val hdfsPath = new Path(path)
```
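The diff cuts off sizeInBytes right after `val hdfsPath = new Path(path)`. One plausible way such a computation could finish, sketched with the stock Hadoop FileSystem API rather than the commit's actual body, is to total the bytes under the relation's directory:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object ParquetSizeSketch {
  // Sketch: estimate a Parquet relation's size as the total length of
  // the files under its directory. `path` stands in for ParquetRelation's
  // path parameter; this is not necessarily the body in the commit.
  def estimatedSizeInBytes(path: String, conf: Configuration): Long = {
    val hdfsPath = new Path(path)
    val fs = hdfsPath.getFileSystem(conf)
    // getContentSummary walks the directory tree and sums file lengths.
    fs.getContentSummary(hdfsPath).getLength
  }
}
```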
MetastoreRelation:

```diff
@@ -270,7 +270,7 @@ private[hive] case class MetastoreRelation
   }
 
   // TODO: are there any stats in hiveQlTable.getSkewedInfo that we can use?
-  @transient override lazy val estimates = new Estimates {
+  @transient override lazy val statistics = new Statistics {
     // TODO: check if this estimate is valid for tables after partition pruning.
     // Size getters adapted from SizeBasedBigTableSelectorForAutoSMJ.java in Hive (version 0.13).
     override lazy val sizeInBytes: Long =
```
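The size getters themselves are below the fold here. Hive keeps table-level statistics such as total file size in the metastore's table parameters (the key "totalSize" is Hive's StatsSetupConst.TOTAL_SIZE), so a lookup of this shape is a reasonable sketch of what such a getter does; the method name and fallback are invented for the example:

```scala
object HiveSizeSketch {
  // Sketch: read Hive's "totalSize" table statistic, falling back to a
  // caller-supplied default when the metastore has no usable value.
  // `parameters` stands in for hiveQlTable.getParameters.
  def tableSizeInBytes(parameters: java.util.Map[String, String],
                       defaultSize: Long): Long =
    Option(parameters.get("totalSize"))
      .flatMap(s => scala.util.Try(s.toLong).toOption)
      .filter(_ > 0)
      .getOrElse(defaultSize)
}
```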
StatisticsSuite (class renamed from EstimatesSuite):

```diff
@@ -25,14 +25,14 @@ import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.parquet.{ParquetRelation, ParquetTestData}
 import org.apache.spark.util.Utils
 
-class EstimatesSuite extends QueryTest {
+class StatisticsSuite extends QueryTest {
 
   test("estimates the size of a test ParquetRelation") {
     ParquetTestData.writeFile()
     val testRDD = parquetFile(ParquetTestData.testDir.toString)
 
     val sizes = testRDD.logicalPlan.collect { case j: ParquetRelation =>
-      (j.estimates.sizeInBytes, j.newInstance.estimates.sizeInBytes)
+      (j.statistics.sizeInBytes, j.newInstance.statistics.sizeInBytes)
     }
     assert(sizes.size === 1)
     assert(sizes(0)._1 == sizes(0)._2, "after .newInstance, estimates are different from before")
@@ -44,7 +44,7 @@ class EstimatesSuite extends QueryTest {
   test("estimates the size of a test MetastoreRelation") {
     val rdd = hql("""SELECT * FROM src""")
     val sizes = rdd.queryExecution.analyzed.collect { case mr: MetastoreRelation =>
-      mr.estimates.sizeInBytes
+      mr.statistics.sizeInBytes
     }
     assert(sizes.size === 1 && sizes(0) > 0)
   }
@@ -63,7 +63,7 @@ class EstimatesSuite extends QueryTest {
     // Assert src has a size smaller than the threshold.
     val sizes = rdd.queryExecution.analyzed.collect {
       case r if ct.runtimeClass.isAssignableFrom(r.getClass) =>
-        r.estimates.sizeInBytes
+        r.statistics.sizeInBytes
     }
     assert(sizes.size === 2 && sizes(0) <= autoConvertJoinSize,
       s"query should contain two relations, each of which has size smaller than autoConvertSize")
```
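All three tests lean on the same idiom: `collect` with a partial function walks the analyzed plan tree and gathers statistics from matching nodes. A standalone sketch of that traversal (ToyNode is illustrative, not Catalyst's TreeNode):

```scala
object CollectIdiomSketch extends App {
  // Sketch of the tree-collect idiom used by the tests above.
  case class ToyNode(name: String, sizeInBytes: Long,
                     children: Seq[ToyNode] = Nil) {
    // Apply a partial function to every node, pre-order, keeping matches.
    def collect[B](pf: PartialFunction[ToyNode, B]): Seq[B] =
      (if (pf.isDefinedAt(this)) Seq(pf(this)) else Nil) ++
        children.flatMap(_.collect(pf))
  }

  val plan = ToyNode("join", 0,
    Seq(ToyNode("src", 1024), ToyNode("src1", 2048)))
  val sizes = plan.collect { case n if n.children.isEmpty => n.sizeInBytes }
  println(sizes) // List(1024, 2048): one entry per leaf relation
}
```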
