diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index 2f416f2af91f1..6f868cbd072c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -115,14 +115,10 @@ object EstimationUtils { } /** - * Returns the number of the first bin into which a column value falls for a specified + * Returns the index of the first bin into which the given value falls for a specified * numeric equi-height histogram. - * - * @param value a literal value of a column - * @param bins an array of bins for a given numeric equi-height histogram - * @return the id of the first bin into which a column value falls. */ - def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + private def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = { var i = 0 while ((i < bins.length) && (value > bins(i).hi)) { i += 1 @@ -131,14 +127,10 @@ object EstimationUtils { } /** - * Returns the number of the last bin into which a column value falls for a specified + * Returns the index of the last bin into which the given value falls for a specified * numeric equi-height histogram. - * - * @param value a literal value of a column - * @param bins an array of bins for a given numeric equi-height histogram - * @return the id of the last bin into which a column value falls. */ - def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + private def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = { var i = bins.length - 1 while ((i >= 0) && (value < bins(i).lo)) { i -= 1 @@ -147,65 +139,76 @@ object EstimationUtils { } /** - * Returns a percentage of a bin holding values for column value in the range of - * [lowerValue, higherValue] - * - * @param higherValue a given upper bound value of a specified column value range - * @param lowerValue a given lower bound value of a specified column value range - * @param bin a single histogram bin - * @return the percentage of a single bin holding values in [lowerValue, higherValue]. + * Returns the possibility of the given histogram bin holding values within the given range + * [lowerBound, upperBound]. */ - private def getOccupation( - higherValue: Double, - lowerValue: Double, + private def binHoldingRangePossibility( + upperBound: Double, + lowerBound: Double, bin: HistogramBin): Double = { - assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi) + assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bin.hi) if (bin.hi == bin.lo) { // the entire bin is covered in the range 1.0 - } else if (higherValue == lowerValue) { + } else if (upperBound == lowerBound) { // set percentage to 1/NDV 1.0 / bin.ndv.toDouble } else { // Use proration since the range falls inside this bin. - math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0) + math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0) } } /** - * Returns the number of bins for column values in [lowerValue, higherValue]. - * The column value distribution is saved in an equi-height histogram. The return values is a - * double value is because we may return a portion of a bin. For example, a predicate - * "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values. + * Returns the number of histogram bins holding values within the given range + * [lowerBound, upperBound]. + * + * Note that the returned value is double type, because the range boundaries usually occupy a + * portion of a bin. An extreme case is [value, value] which is generated by equal predicate + * `col = value`, we can get higher accuracy by allowing returning portion of histogram bins. * - * @param higherId id of the high end bin holding the high end value of a column range - * @param lowerId id of the low end bin holding the low end value of a column range - * @param higherEnd a given upper bound value of a specified column value range - * @param lowerEnd a given lower bound value of a specified column value range - * @param histogram a numeric equi-height histogram - * @return the number of bins for column values in [lowerEnd, higherEnd]. + * @param upperBound the highest value of the given range + * @param upperBoundInclusive whether the upperBound is included in the range + * @param lowerBound the lowest value of the given range + * @param lowerBoundInclusive whether the lowerBound is included in the range + * @param bins an array of bins for a given numeric equi-height histogram */ - def getOccupationBins( - higherId: Int, - lowerId: Int, - higherEnd: Double, - lowerEnd: Double, - histogram: Histogram): Double = { - assert(lowerId <= higherId) - - if (lowerId == higherId) { - val curBin = histogram.bins(lowerId) - getOccupation(higherEnd, lowerEnd, curBin) + def numBinsHoldingRange( + upperBound: Double, + upperBoundInclusive: Boolean, + lowerBound: Double, + lowerBoundInclusive: Boolean, + bins: Array[HistogramBin]): Double = { + assert(bins.head.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bins.last.hi, + "Given range does not fit in the given histogram.") + assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive, + s"'$lowerBound < value < $upperBound' is an invalid range.") + + val upperBinIndex = if (upperBoundInclusive) { + findLastBinForValue(upperBound, bins) + } else { + findFirstBinForValue(upperBound, bins) + } + val lowerBinIndex = if (lowerBoundInclusive) { + findFirstBinForValue(lowerBound, bins) + } else { + findLastBinForValue(lowerBound, bins) + } + assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.") + + + if (lowerBinIndex == upperBinIndex) { + binHoldingRangePossibility(upperBound, lowerBound, bins(lowerBinIndex)) } else { - // compute how much lowerEnd/higherEnd occupies its bin - val lowerCurBin = histogram.bins(lowerId) - val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin) + // Computes the occupied portion of bins of the upperBound and lowerBound. + val lowerBin = bins(lowerBinIndex) + val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin) - val higherCurBin = histogram.bins(higherId) - val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin) + val higherBin = bins(upperBinIndex) + val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin) - // the total length is lowerPart + higherPart + bins between them - lowerPart + higherPart + higherId - lowerId - 1 + // The total number of bins is lowerPart + higherPart + bins between them + lowerPart + higherPart + upperBinIndex - lowerBinIndex - 1 } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index f52a15edbfe4a..850dd1ba724a0 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging { // returns 1/ndv if there is no histogram Some(1.0 / BigDecimal(ndv)) } else { - // We compute filter selectivity using Histogram information. - val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble - val histogram = colStat.histogram.get - val hgmBins = histogram.bins - - // find bins where column's current min and max locate. Note that a column's [min, max] - // range may change due to another condition applied earlier. - val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble - val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble - val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins) - val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins) - - // compute how many bins the column's current valid range [min, max] occupies. - // Note that a column's [min, max] range may vary after we apply some filter conditions. - val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, - min, histogram) - - val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins) - val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins) - assert(lowerBinId <= higherBinId) - val lowerBinNdv = hgmBins(lowerBinId).ndv - val higherBinNdv = hgmBins(higherBinId).ndv - // assume uniform distribution in each bin - val occupiedBins = if (lowerBinId == higherBinId) { - 1.0 / lowerBinNdv - } else { - (1.0 / lowerBinNdv) + // lowest bin - (higherBinId - lowerBinId - 1) + // middle bins - (1.0 / higherBinNdv) // highest bin - } - Some(occupiedBins / validRangeBins) + Some(computeEqualityPossibilityByHistogram(literal, colStat)) } } else { // not in interval Some(0.0) } - } /** @@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging { } } } else { - val numericHistogram = colStat.histogram.get - val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble - val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble - val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble - percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum) + percent = computeComparisonPossibilityByHistogram(op, literal, colStat) } if (update) { @@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging { } /** - * Returns the selectivity percentage for binary condition in the column's - * current valid range [min, max] - * - * @param op a binary comparison operator - * @param histogram a numeric equi-height histogram - * @param max the upper bound of the current valid range for a given column - * @param min the lower bound of the current valid range for a given column - * @param datumNumber the numeric value of a literal - * @return the selectivity percentage for a condition in the current range. + * Computes the possibility of an equality predicate using histogram. */ + private def computeEqualityPossibilityByHistogram( + literal: Literal, colStat: ColumnStat): Double = { + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val histogram = colStat.histogram.get - def computePercentByEquiHeightHgm( - op: BinaryComparison, - histogram: Histogram, - max: Double, - min: Double, - datumNumber: Double): Double = { // find bins where column's current min and max locate. Note that a column's [min, max] // range may change due to another condition applied earlier. - val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins) - val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins) + val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble // compute how many bins the column's current valid range [min, max] occupies. - // Note that a column's [min, max] range may vary after we apply some filter conditions. - val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram) - - val datumInBinId = op match { - case LessThan(_, _) | GreaterThanOrEqual(_, _) => - EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins) - case LessThanOrEqual(_, _) | GreaterThan(_, _) => - EstimationUtils.findLastBinForValue(datumNumber, histogram.bins) - } + val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = min, + lowerBoundInclusive = true, + histogram.bins) + + val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = true, + histogram.bins) + + numBinsHoldingDatum / numBinsHoldingEntireRange + } - op match { - // LessThan and LessThanOrEqual share the same logic, - // but their datumInBinId may be different - case LessThan(_, _) | LessThanOrEqual(_, _) => - EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, min, - histogram) / minToMaxLength - // GreaterThan and GreaterThanOrEqual share the same logic, - // but their datumInBinId may be different - case GreaterThan(_, _) | GreaterThanOrEqual(_, _) => - EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, datumNumber, - histogram) / minToMaxLength + /** + * Computes the possibility of a comparison predicate using histogram. + */ + private def computeComparisonPossibilityByHistogram( + op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = { + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val histogram = colStat.histogram.get + + // find bins where column's current min and max locate. Note that a column's [min, max] + // range may change due to another condition applied earlier. + val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble + + // compute how many bins the column's current valid range [min, max] occupies. + val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange( + max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram.bins) + + val numBinsHoldingRange = op match { + // LessThan and LessThanOrEqual share the same logic, the only difference is whether to + // include the upperBound in the range. + case _: LessThan => + EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = false, + lowerBound = min, + lowerBoundInclusive = true, + histogram.bins) + case _: LessThanOrEqual => + EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = true, + lowerBound = min, + lowerBoundInclusive = true, + histogram.bins) + + // GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to + // include the lowerBound in the range. + case _: GreaterThan => + EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = false, + histogram.bins) + case _: GreaterThanOrEqual => + EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = true, + histogram.bins) } + + numBinsHoldingRange / numBinsHoldingEntireRange } /**