From ebcd6d107087cf024362b3a901514cfc12c3dbab Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 12 Dec 2017 22:36:02 +0800 Subject: [PATCH 1/3] code cleanup --- .../statsEstimation/EstimationUtils.scala | 105 ++++++------ .../statsEstimation/FilterEstimation.scala | 152 +++++++++--------- 2 files changed, 133 insertions(+), 124 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index 2f416f2af91f1..f2bc8be9106fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -115,14 +115,10 @@ object EstimationUtils { } /** - * Returns the number of the first bin into which a column value falls for a specified + * Returns the index of the first bin into which the given value falls for a specified * numeric equi-height histogram. - * - * @param value a literal value of a column - * @param bins an array of bins for a given numeric equi-height histogram - * @return the id of the first bin into which a column value falls. */ - def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + private def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = { var i = 0 while ((i < bins.length) && (value > bins(i).hi)) { i += 1 @@ -131,14 +127,10 @@ object EstimationUtils { } /** - * Returns the number of the last bin into which a column value falls for a specified + * Returns the index of the last bin into which the given value falls for a specified * numeric equi-height histogram. - * - * @param value a literal value of a column - * @param bins an array of bins for a given numeric equi-height histogram - * @return the id of the last bin into which a column value falls. */ - def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + private def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = { var i = bins.length - 1 while ((i >= 0) && (value < bins(i).lo)) { i -= 1 @@ -147,65 +139,78 @@ object EstimationUtils { } /** - * Returns a percentage of a bin holding values for column value in the range of - * [lowerValue, higherValue] - * - * @param higherValue a given upper bound value of a specified column value range - * @param lowerValue a given lower bound value of a specified column value range - * @param bin a single histogram bin - * @return the percentage of a single bin holding values in [lowerValue, higherValue]. + * Returns the possibility of the given histogram bin holding values within the given range + * [lowerBound, upperBound]. */ - private def getOccupation( - higherValue: Double, - lowerValue: Double, + private def binHoldingRangePossibility( + upperBound: Double, + lowerBound: Double, bin: HistogramBin): Double = { - assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi) + assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bin.hi) if (bin.hi == bin.lo) { // the entire bin is covered in the range 1.0 - } else if (higherValue == lowerValue) { + } else if (upperBound == lowerBound) { // set percentage to 1/NDV 1.0 / bin.ndv.toDouble } else { // Use proration since the range falls inside this bin. - math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0) + math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0) } } /** - * Returns the number of bins for column values in [lowerValue, higherValue]. - * The column value distribution is saved in an equi-height histogram. The return values is a - * double value is because we may return a portion of a bin. For example, a predicate - * "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values. + * Returns the number of histogram bins holding values within the given range + * [lowerBound, upperBound]. * - * @param higherId id of the high end bin holding the high end value of a column range - * @param lowerId id of the low end bin holding the low end value of a column range - * @param higherEnd a given upper bound value of a specified column value range - * @param lowerEnd a given lower bound value of a specified column value range + * Note that the return value is double type, because the range boundaries usually occupy a + * portion of a bin. An extrema case is [value, value] which is generated by equal predicate + * `col = value`, we can get more accuracy by allowing returning portion of histogram bins. + * + * @param upperBound the highest value of the given range + * @param upperBoundInclusive whether the upperBound is included in the range + * @param lowerBound the lowest value of the given range + * @param lowerBoundInclusive whether the lowerBound is included in the range * @param histogram a numeric equi-height histogram - * @return the number of bins for column values in [lowerEnd, higherEnd]. */ - def getOccupationBins( - higherId: Int, - lowerId: Int, - higherEnd: Double, - lowerEnd: Double, + def numBinsHoldingRange( + upperBound: Double, + upperBoundInclusive: Boolean, + lowerBound: Double, + lowerBoundInclusive: Boolean, histogram: Histogram): Double = { - assert(lowerId <= higherId) + assert(histogram.bins.head.lo <= lowerBound && + lowerBound <= upperBound && + upperBound <= histogram.bins.last.hi, + "Given range does not fit in the given histogram.") + assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive, + s"'$lowerBound < value < $upperBound' is an invalid range.") + + val upperBinIndex = if (upperBoundInclusive) { + findLastBinForValue(upperBound, histogram.bins) + } else { + findFirstBinForValue(upperBound, histogram.bins) + } + val lowerBinIndex = if (lowerBoundInclusive) { + findFirstBinForValue(lowerBound, histogram.bins) + } else { + findLastBinForValue(lowerBound, histogram.bins) + } + assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.") + - if (lowerId == higherId) { - val curBin = histogram.bins(lowerId) - getOccupation(higherEnd, lowerEnd, curBin) + if (lowerBinIndex == upperBinIndex) { + binHoldingRangePossibility(upperBound, lowerBound, histogram.bins(lowerBinIndex)) } else { - // compute how much lowerEnd/higherEnd occupies its bin - val lowerCurBin = histogram.bins(lowerId) - val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin) + // Computes the occupied portion of bins of the upperBound and lowerBound. + val lowerBin = histogram.bins(lowerBinIndex) + val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin) - val higherCurBin = histogram.bins(higherId) - val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin) + val higherBin = histogram.bins(upperBinIndex) + val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin) - // the total length is lowerPart + higherPart + bins between them - lowerPart + higherPart + higherId - lowerId - 1 + // The total number of bins is lowerPart + higherPart + bins between them + lowerPart + higherPart + upperBinIndex - lowerBinIndex - 1 } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index f52a15edbfe4a..c2170060fbb2a 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging { // returns 1/ndv if there is no histogram Some(1.0 / BigDecimal(ndv)) } else { - // We compute filter selectivity using Histogram information. - val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble - val histogram = colStat.histogram.get - val hgmBins = histogram.bins - - // find bins where column's current min and max locate. Note that a column's [min, max] - // range may change due to another condition applied earlier. - val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble - val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble - val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins) - val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins) - - // compute how many bins the column's current valid range [min, max] occupies. - // Note that a column's [min, max] range may vary after we apply some filter conditions. - val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, - min, histogram) - - val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins) - val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins) - assert(lowerBinId <= higherBinId) - val lowerBinNdv = hgmBins(lowerBinId).ndv - val higherBinNdv = hgmBins(higherBinId).ndv - // assume uniform distribution in each bin - val occupiedBins = if (lowerBinId == higherBinId) { - 1.0 / lowerBinNdv - } else { - (1.0 / lowerBinNdv) + // lowest bin - (higherBinId - lowerBinId - 1) + // middle bins - (1.0 / higherBinNdv) // highest bin - } - Some(occupiedBins / validRangeBins) + Some(computeEqualityPossibilityByHistogram(literal, colStat)) } } else { // not in interval Some(0.0) } - } /** @@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging { } } } else { - val numericHistogram = colStat.histogram.get - val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble - val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble - val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble - percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum) + percent = computeComparisonPossibilityByHistogram(op, literal, colStat) } if (update) { @@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging { } /** - * Returns the selectivity percentage for binary condition in the column's - * current valid range [min, max] - * - * @param op a binary comparison operator - * @param histogram a numeric equi-height histogram - * @param max the upper bound of the current valid range for a given column - * @param min the lower bound of the current valid range for a given column - * @param datumNumber the numeric value of a literal - * @return the selectivity percentage for a condition in the current range. + * Computes the possibility of a equal predicate using histogram. */ + private def computeEqualityPossibilityByHistogram( + literal: Literal, colStat: ColumnStat): Double = { + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val histogram = colStat.histogram.get - def computePercentByEquiHeightHgm( - op: BinaryComparison, - histogram: Histogram, - max: Double, - min: Double, - datumNumber: Double): Double = { // find bins where column's current min and max locate. Note that a column's [min, max] // range may change due to another condition applied earlier. - val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins) - val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins) + val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble // compute how many bins the column's current valid range [min, max] occupies. - // Note that a column's [min, max] range may vary after we apply some filter conditions. - val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram) - - val datumInBinId = op match { - case LessThan(_, _) | GreaterThanOrEqual(_, _) => - EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins) - case LessThanOrEqual(_, _) | GreaterThan(_, _) => - EstimationUtils.findLastBinForValue(datumNumber, histogram.bins) - } + val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = min, + lowerBoundInclusive = true, + histogram) + + val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = true, + histogram) + + numBinsHoldingDatum / numBinsHoldingEntireRange + } - op match { - // LessThan and LessThanOrEqual share the same logic, - // but their datumInBinId may be different - case LessThan(_, _) | LessThanOrEqual(_, _) => - EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, min, - histogram) / minToMaxLength - // GreaterThan and GreaterThanOrEqual share the same logic, - // but their datumInBinId may be different - case GreaterThan(_, _) | GreaterThanOrEqual(_, _) => - EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, datumNumber, - histogram) / minToMaxLength + /** + * Computes the possibility of a comparison predicate using histogram. + */ + private def computeComparisonPossibilityByHistogram( + op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = { + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val histogram = colStat.histogram.get + + // find bins where column's current min and max locate. Note that a column's [min, max] + // range may change due to another condition applied earlier. + val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble + + // compute how many bins the column's current valid range [min, max] occupies. + val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange( + max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram) + + val numBinsHoldingDatum = op match { + // LessThan and LessThanOrEqual share the same logic, the only difference is whether to + // include the upperBound in the range. + case _: LessThan => + EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = false, + lowerBound = min, + lowerBoundInclusive = true, + histogram) + case _: LessThanOrEqual => + EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = true, + lowerBound = min, + lowerBoundInclusive = true, + histogram) + + // GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to + // include the lowerBound in the range. + case _: GreaterThan => + EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = false, + histogram) + case _: GreaterThanOrEqual => + EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = true, + histogram) } + + numBinsHoldingDatum / numBinsHoldingEntireRange } /** From 8fe0c4991b90781a7017de4938705bbc32244dc6 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 13 Dec 2017 10:51:35 +0800 Subject: [PATCH 2/3] address comments --- .../statsEstimation/EstimationUtils.scala | 26 +++++++++---------- .../statsEstimation/FilterEstimation.scala | 20 +++++++------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index f2bc8be9106fe..484ad2beffbaa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -163,50 +163,48 @@ object EstimationUtils { * Returns the number of histogram bins holding values within the given range * [lowerBound, upperBound]. * - * Note that the return value is double type, because the range boundaries usually occupy a + * Note that the returned value is double type, because the range boundaries usually occupy a * portion of a bin. An extrema case is [value, value] which is generated by equal predicate - * `col = value`, we can get more accuracy by allowing returning portion of histogram bins. + * `col = value`, we can get higher accuracy by allowing returning portion of histogram bins. * * @param upperBound the highest value of the given range * @param upperBoundInclusive whether the upperBound is included in the range * @param lowerBound the lowest value of the given range * @param lowerBoundInclusive whether the lowerBound is included in the range - * @param histogram a numeric equi-height histogram + * @param bins an array of bins for a given numeric equi-height histogram */ def numBinsHoldingRange( upperBound: Double, upperBoundInclusive: Boolean, lowerBound: Double, lowerBoundInclusive: Boolean, - histogram: Histogram): Double = { - assert(histogram.bins.head.lo <= lowerBound && - lowerBound <= upperBound && - upperBound <= histogram.bins.last.hi, + bins: Array[HistogramBin]): Double = { + assert(bins.head.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bins.last.hi, "Given range does not fit in the given histogram.") assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive, s"'$lowerBound < value < $upperBound' is an invalid range.") val upperBinIndex = if (upperBoundInclusive) { - findLastBinForValue(upperBound, histogram.bins) + findLastBinForValue(upperBound, bins) } else { - findFirstBinForValue(upperBound, histogram.bins) + findFirstBinForValue(upperBound, bins) } val lowerBinIndex = if (lowerBoundInclusive) { - findFirstBinForValue(lowerBound, histogram.bins) + findFirstBinForValue(lowerBound, bins) } else { - findLastBinForValue(lowerBound, histogram.bins) + findLastBinForValue(lowerBound, bins) } assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.") if (lowerBinIndex == upperBinIndex) { - binHoldingRangePossibility(upperBound, lowerBound, histogram.bins(lowerBinIndex)) + binHoldingRangePossibility(upperBound, lowerBound, bins(lowerBinIndex)) } else { // Computes the occupied portion of bins of the upperBound and lowerBound. - val lowerBin = histogram.bins(lowerBinIndex) + val lowerBin = bins(lowerBinIndex) val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin) - val higherBin = histogram.bins(upperBinIndex) + val higherBin = bins(upperBinIndex) val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin) // The total number of bins is lowerPart + higherPart + bins between them diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index c2170060fbb2a..850dd1ba724a0 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -539,7 +539,7 @@ case class FilterEstimation(plan: Filter) extends Logging { } /** - * Computes the possibility of a equal predicate using histogram. + * Computes the possibility of an equality predicate using histogram. */ private def computeEqualityPossibilityByHistogram( literal: Literal, colStat: ColumnStat): Double = { @@ -557,14 +557,14 @@ case class FilterEstimation(plan: Filter) extends Logging { upperBoundInclusive = true, lowerBound = min, lowerBoundInclusive = true, - histogram) + histogram.bins) val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange( upperBound = datum, upperBoundInclusive = true, lowerBound = datum, lowerBoundInclusive = true, - histogram) + histogram.bins) numBinsHoldingDatum / numBinsHoldingEntireRange } @@ -584,9 +584,9 @@ case class FilterEstimation(plan: Filter) extends Logging { // compute how many bins the column's current valid range [min, max] occupies. val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange( - max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram) + max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram.bins) - val numBinsHoldingDatum = op match { + val numBinsHoldingRange = op match { // LessThan and LessThanOrEqual share the same logic, the only difference is whether to // include the upperBound in the range. case _: LessThan => @@ -595,14 +595,14 @@ case class FilterEstimation(plan: Filter) extends Logging { upperBoundInclusive = false, lowerBound = min, lowerBoundInclusive = true, - histogram) + histogram.bins) case _: LessThanOrEqual => EstimationUtils.numBinsHoldingRange( upperBound = datum, upperBoundInclusive = true, lowerBound = min, lowerBoundInclusive = true, - histogram) + histogram.bins) // GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to // include the lowerBound in the range. @@ -612,17 +612,17 @@ case class FilterEstimation(plan: Filter) extends Logging { upperBoundInclusive = true, lowerBound = datum, lowerBoundInclusive = false, - histogram) + histogram.bins) case _: GreaterThanOrEqual => EstimationUtils.numBinsHoldingRange( upperBound = max, upperBoundInclusive = true, lowerBound = datum, lowerBoundInclusive = true, - histogram) + histogram.bins) } - numBinsHoldingDatum / numBinsHoldingEntireRange + numBinsHoldingRange / numBinsHoldingEntireRange } /** From 4e35c43957cf27b105c8f6b8ff19621aac540098 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 13 Dec 2017 11:38:31 +0800 Subject: [PATCH 3/3] fix typo --- .../plans/logical/statsEstimation/EstimationUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index 484ad2beffbaa..6f868cbd072c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -164,7 +164,7 @@ object EstimationUtils { * [lowerBound, upperBound]. * * Note that the returned value is double type, because the range boundaries usually occupy a - * portion of a bin. An extrema case is [value, value] which is generated by equal predicate + * portion of a bin. An extreme case is [value, value] which is generated by equal predicate * `col = value`, we can get higher accuracy by allowing returning portion of histogram bins. * * @param upperBound the highest value of the given range