Skip to content

Commit

Permalink
[SPARK-21322][SQL][FOLLOWUP] support histogram in filter cardinality …
Browse files Browse the repository at this point in the history
…estimation

## What changes were proposed in this pull request?

some code cleanup/refactor and naming improvement.

## How was this patch tested?

existing tests

Author: Wenchen Fan <wenchen@databricks.com>

Closes #19952 from cloud-fan/minor.
  • Loading branch information
cloud-fan committed Dec 13, 2017
1 parent 13e489b commit bdb5e55
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 127 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,10 @@ object EstimationUtils {
}

/**
* Returns the number of the first bin into which a column value falls for a specified
* Returns the index of the first bin into which the given value falls for a specified
* numeric equi-height histogram.
*
* @param value a literal value of a column
* @param bins an array of bins for a given numeric equi-height histogram
* @return the id of the first bin into which a column value falls.
*/
def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
private def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
var i = 0
while ((i < bins.length) && (value > bins(i).hi)) {
i += 1
Expand All @@ -131,14 +127,10 @@ object EstimationUtils {
}

/**
* Returns the number of the last bin into which a column value falls for a specified
* Returns the index of the last bin into which the given value falls for a specified
* numeric equi-height histogram.
*
* @param value a literal value of a column
* @param bins an array of bins for a given numeric equi-height histogram
* @return the id of the last bin into which a column value falls.
*/
def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
private def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
var i = bins.length - 1
while ((i >= 0) && (value < bins(i).lo)) {
i -= 1
Expand All @@ -147,65 +139,76 @@ object EstimationUtils {
}

/**
* Returns a percentage of a bin holding values for column value in the range of
* [lowerValue, higherValue]
*
* @param higherValue a given upper bound value of a specified column value range
* @param lowerValue a given lower bound value of a specified column value range
* @param bin a single histogram bin
* @return the percentage of a single bin holding values in [lowerValue, higherValue].
* Returns the possibility of the given histogram bin holding values within the given range
* [lowerBound, upperBound].
*/
private def getOccupation(
higherValue: Double,
lowerValue: Double,
private def binHoldingRangePossibility(
upperBound: Double,
lowerBound: Double,
bin: HistogramBin): Double = {
assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi)
assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bin.hi)
if (bin.hi == bin.lo) {
// the entire bin is covered in the range
1.0
} else if (higherValue == lowerValue) {
} else if (upperBound == lowerBound) {
// set percentage to 1/NDV
1.0 / bin.ndv.toDouble
} else {
// Use proration since the range falls inside this bin.
math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0)
math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0)
}
}

/**
* Returns the number of bins for column values in [lowerValue, higherValue].
* The column value distribution is saved in an equi-height histogram. The return values is a
* double value is because we may return a portion of a bin. For example, a predicate
* "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values.
* Returns the number of histogram bins holding values within the given range
* [lowerBound, upperBound].
*
* Note that the returned value is double type, because the range boundaries usually occupy a
* portion of a bin. An extreme case is [value, value] which is generated by equal predicate
* `col = value`, we can get higher accuracy by allowing returning portion of histogram bins.
*
* @param higherId id of the high end bin holding the high end value of a column range
* @param lowerId id of the low end bin holding the low end value of a column range
* @param higherEnd a given upper bound value of a specified column value range
* @param lowerEnd a given lower bound value of a specified column value range
* @param histogram a numeric equi-height histogram
* @return the number of bins for column values in [lowerEnd, higherEnd].
* @param upperBound the highest value of the given range
* @param upperBoundInclusive whether the upperBound is included in the range
* @param lowerBound the lowest value of the given range
* @param lowerBoundInclusive whether the lowerBound is included in the range
* @param bins an array of bins for a given numeric equi-height histogram
*/
def getOccupationBins(
higherId: Int,
lowerId: Int,
higherEnd: Double,
lowerEnd: Double,
histogram: Histogram): Double = {
assert(lowerId <= higherId)

if (lowerId == higherId) {
val curBin = histogram.bins(lowerId)
getOccupation(higherEnd, lowerEnd, curBin)
def numBinsHoldingRange(
upperBound: Double,
upperBoundInclusive: Boolean,
lowerBound: Double,
lowerBoundInclusive: Boolean,
bins: Array[HistogramBin]): Double = {
assert(bins.head.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bins.last.hi,
"Given range does not fit in the given histogram.")
assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive,
s"'$lowerBound < value < $upperBound' is an invalid range.")

val upperBinIndex = if (upperBoundInclusive) {
findLastBinForValue(upperBound, bins)
} else {
findFirstBinForValue(upperBound, bins)
}
val lowerBinIndex = if (lowerBoundInclusive) {
findFirstBinForValue(lowerBound, bins)
} else {
findLastBinForValue(lowerBound, bins)
}
assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.")


if (lowerBinIndex == upperBinIndex) {
binHoldingRangePossibility(upperBound, lowerBound, bins(lowerBinIndex))
} else {
// compute how much lowerEnd/higherEnd occupies its bin
val lowerCurBin = histogram.bins(lowerId)
val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin)
// Computes the occupied portion of bins of the upperBound and lowerBound.
val lowerBin = bins(lowerBinIndex)
val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin)

val higherCurBin = histogram.bins(higherId)
val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin)
val higherBin = bins(upperBinIndex)
val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin)

// the total length is lowerPart + higherPart + bins between them
lowerPart + higherPart + higherId - lowerId - 1
// The total number of bins is lowerPart + higherPart + bins between them
lowerPart + higherPart + upperBinIndex - lowerBinIndex - 1
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging {
// returns 1/ndv if there is no histogram
Some(1.0 / BigDecimal(ndv))
} else {
// We compute filter selectivity using Histogram information.
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
val histogram = colStat.histogram.get
val hgmBins = histogram.bins

// find bins where column's current min and max locate. Note that a column's [min, max]
// range may change due to another condition applied earlier.
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins)
val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins)

// compute how many bins the column's current valid range [min, max] occupies.
// Note that a column's [min, max] range may vary after we apply some filter conditions.
val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max,
min, histogram)

val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins)
val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins)
assert(lowerBinId <= higherBinId)
val lowerBinNdv = hgmBins(lowerBinId).ndv
val higherBinNdv = hgmBins(higherBinId).ndv
// assume uniform distribution in each bin
val occupiedBins = if (lowerBinId == higherBinId) {
1.0 / lowerBinNdv
} else {
(1.0 / lowerBinNdv) + // lowest bin
(higherBinId - lowerBinId - 1) + // middle bins
(1.0 / higherBinNdv) // highest bin
}
Some(occupiedBins / validRangeBins)
Some(computeEqualityPossibilityByHistogram(literal, colStat))
}

} else { // not in interval
Some(0.0)
}

}

/**
Expand Down Expand Up @@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
}
}
} else {
val numericHistogram = colStat.histogram.get
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum)
percent = computeComparisonPossibilityByHistogram(op, literal, colStat)
}

if (update) {
Expand Down Expand Up @@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging {
}

/**
* Returns the selectivity percentage for binary condition in the column's
* current valid range [min, max]
*
* @param op a binary comparison operator
* @param histogram a numeric equi-height histogram
* @param max the upper bound of the current valid range for a given column
* @param min the lower bound of the current valid range for a given column
* @param datumNumber the numeric value of a literal
* @return the selectivity percentage for a condition in the current range.
* Computes the possibility of an equality predicate using histogram.
*/
private def computeEqualityPossibilityByHistogram(
literal: Literal, colStat: ColumnStat): Double = {
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
val histogram = colStat.histogram.get

def computePercentByEquiHeightHgm(
op: BinaryComparison,
histogram: Histogram,
max: Double,
min: Double,
datumNumber: Double): Double = {
// find bins where column's current min and max locate. Note that a column's [min, max]
// range may change due to another condition applied earlier.
val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins)
val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins)
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble

// compute how many bins the column's current valid range [min, max] occupies.
// Note that a column's [min, max] range may vary after we apply some filter conditions.
val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram)

val datumInBinId = op match {
case LessThan(_, _) | GreaterThanOrEqual(_, _) =>
EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins)
case LessThanOrEqual(_, _) | GreaterThan(_, _) =>
EstimationUtils.findLastBinForValue(datumNumber, histogram.bins)
}
val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
upperBound = max,
upperBoundInclusive = true,
lowerBound = min,
lowerBoundInclusive = true,
histogram.bins)

val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
upperBound = datum,
upperBoundInclusive = true,
lowerBound = datum,
lowerBoundInclusive = true,
histogram.bins)

numBinsHoldingDatum / numBinsHoldingEntireRange
}

op match {
// LessThan and LessThanOrEqual share the same logic,
// but their datumInBinId may be different
case LessThan(_, _) | LessThanOrEqual(_, _) =>
EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, min,
histogram) / minToMaxLength
// GreaterThan and GreaterThanOrEqual share the same logic,
// but their datumInBinId may be different
case GreaterThan(_, _) | GreaterThanOrEqual(_, _) =>
EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, datumNumber,
histogram) / minToMaxLength
/**
* Computes the possibility of a comparison predicate using histogram.
*/
private def computeComparisonPossibilityByHistogram(
op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = {
val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
val histogram = colStat.histogram.get

// find bins where column's current min and max locate. Note that a column's [min, max]
// range may change due to another condition applied earlier.
val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble

// compute how many bins the column's current valid range [min, max] occupies.
val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram.bins)

val numBinsHoldingRange = op match {
// LessThan and LessThanOrEqual share the same logic, the only difference is whether to
// include the upperBound in the range.
case _: LessThan =>
EstimationUtils.numBinsHoldingRange(
upperBound = datum,
upperBoundInclusive = false,
lowerBound = min,
lowerBoundInclusive = true,
histogram.bins)
case _: LessThanOrEqual =>
EstimationUtils.numBinsHoldingRange(
upperBound = datum,
upperBoundInclusive = true,
lowerBound = min,
lowerBoundInclusive = true,
histogram.bins)

// GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to
// include the lowerBound in the range.
case _: GreaterThan =>
EstimationUtils.numBinsHoldingRange(
upperBound = max,
upperBoundInclusive = true,
lowerBound = datum,
lowerBoundInclusive = false,
histogram.bins)
case _: GreaterThanOrEqual =>
EstimationUtils.numBinsHoldingRange(
upperBound = max,
upperBoundInclusive = true,
lowerBound = datum,
lowerBoundInclusive = true,
histogram.bins)
}

numBinsHoldingRange / numBinsHoldingEntireRange
}

/**
Expand Down

0 comments on commit bdb5e55

Please sign in to comment.