apache · ron8hu · Nov 19, 2017 · Nov 25, 2017 · Nov 28, 2017 · Nov 29, 2017
diff --git a/...n/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/...n/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation
 import scala.math.BigDecimal.RoundingMode
 
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap}
-import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LogicalPlan, Statistics}
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types.{DecimalType, _}
 
 
@@ -114,4 +114,99 @@ object EstimationUtils {
     }
   }
 
+  /**
+   * Locates the lowest-numbered bin of a numeric equi-height histogram that can hold a given
+   * column value, i.e. the first bin whose upper bound `hi` is not exceeded by the value.
+   *
+   * @param value a literal value of a column
+   * @param bins an array of bins for a given numeric equi-height histogram
+   * @return the id of that bin, or `bins.length` if the value is greater than the `hi` of
+   *         every bin (which includes the case of an empty `bins` array).
+   */
+  def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
+    // The predicate is a negated `>` rather than `<=`: the two differ only when a NaN is
+    // involved, in which case this form accepts the bin.
+    val firstFit = bins.indexWhere(bin => !(value > bin.hi))
+    if (firstFit >= 0) firstFit else bins.length
+  }
+
+  /**
+   * Locates the highest-numbered bin of a numeric equi-height histogram that can hold a given
+   * column value, i.e. the last bin whose lower bound `lo` does not exceed the value.
+   *
+   * @param value a literal value of a column
+   * @param bins an array of bins for a given numeric equi-height histogram
+   * @return the id of that bin, or -1 if the value is smaller than the `lo` of every bin
+   *         (which includes the case of an empty `bins` array).
+   */
+  def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
+    // The predicate is a negated `<` rather than `>=`: the two differ only when a NaN is
+    // involved, in which case this form accepts the bin. `lastIndexWhere` yields -1 when no
+    // bin qualifies, which is the "below every bin" result.
+    bins.lastIndexWhere(bin => !(value < bin.lo))
+  }
+
+  /**
+   * Computes the fraction of a single histogram bin covered by the column value range
+   * [lowerValue, higherValue], which must lie entirely inside the bin (checked by an assert).
+   *
+   * @param higherValue a given upper bound value of a specified column value range
+   * @param lowerValue a given lower bound value of a specified column value range
+   * @param bin a single histogram bin
+   * @return the fraction of the bin that holds values in [lowerValue, higherValue].
+   */
+  private def getOccupation(
+      higherValue: Double,
+      lowerValue: Double,
+      bin: HistogramBin): Double = {
+    assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi)
+    val binHoldsSingleValue = bin.hi == bin.lo
+    val rangeIsSinglePoint = higherValue == lowerValue
+    if (binHoldsSingleValue) {
+      1.0  // the bin has zero width, so the range covers all of it
+    } else if (rangeIsSinglePoint) {
+      1.0 / bin.ndv.toDouble  // a single value is counted as 1/ndv of the bin
+    } else {
+      // prorate the width of the range over the width of the bin, capped at a whole bin
+      math.min(1.0, (higherValue - lowerValue) / (bin.hi - bin.lo))
+    }
+  }
+
+  /**
+   * Measures, in units of bins, how much of an equi-height histogram is covered by the column
+   * value range [lowerEnd, higherEnd]. The result is a Double because the bins at either end
+   * of the range may be only partially covered. For example, a predicate "column = 8" may
+   * yield 0.2 bins if the bin holding 8 has 5 distinct values.
+   *
+   * @param higherId id of the high end bin holding the high end value of a column range
+   * @param lowerId id of the low end bin holding the low end value of a column range
+   * @param higherEnd a given upper bound value of a specified column value range
+   * @param lowerEnd a given lower bound value of a specified column value range
+   * @param histogram a numeric equi-height histogram
+   * @return the (possibly fractional) number of bins covered by [lowerEnd, higherEnd].
+   */
+  def getOccupationBins(
+      higherId: Int,
+      lowerId: Int,
+      higherEnd: Double,
+      lowerEnd: Double,
+      histogram: Histogram): Double = {
+    assert(lowerId <= higherId)
+
+    if (lowerId != higherId) {
+      // The range spans several bins: the first bin is covered from lowerEnd up to its hi,
+      // the last bin from its lo up to higherEnd.
+      val firstBin = histogram.bins(lowerId)
+      val firstBinShare = getOccupation(firstBin.hi, lowerEnd, firstBin)
+      val lastBin = histogram.bins(higherId)
+      val lastBinShare = getOccupation(higherEnd, lastBin.lo, lastBin)
+
+      // every bin strictly between the two end bins counts as one whole bin
+      firstBinShare + lastBinShare + higherId - lowerId - 1
+    } else {
+      // the whole range sits inside a single bin
+      getOccupation(higherEnd, lowerEnd, histogram.bins(lowerId))
+    }
+  }
+
 }
diff --git a/.../scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/.../scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -23,7 +23,7 @@ import scala.collection.mutable
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
-import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, LeafNode, Statistics}
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
 import org.apache.spark.sql.types._
 
@@ -265,7 +265,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
    * @param update a boolean flag to specify if we need to update ColumnStat of a given column
    *               for subsequent conditions
    * @return an optional double value to show the percentage of rows meeting a given condition
-    *         It returns None if no statistics exists for a given column or wrong value.
+   *         It returns None if no statistics exists for a given column or wrong value.
    */
   def evaluateBinary(
       op: BinaryComparison,
@@ -332,8 +332,44 @@ case class FilterEstimation(plan: Filter) extends Logging {
         colStatsMap.update(attr, newStats)
       }
 
-      Some(1.0 / BigDecimal(ndv))
-    } else {
+      if (colStat.histogram.isEmpty) {
+        // returns 1/ndv if there is no histogram
+        Some(1.0 / BigDecimal(ndv))
+      } else {
+        // We compute filter selectivity using Histogram information.
+        val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
+        val histogram = colStat.histogram.get
+        val hgmBins = histogram.bins
+
+        // find bins where column's current min and max locate.  Note that a column's [min, max]
+        // range may change due to another condition applied earlier.
+        val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
+        val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
+        val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins)
+        val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins)
+
+        // compute how many bins the column's current valid range [min, max] occupies.
+        // Note that a column's [min, max] range may vary after we apply some filter conditions.
+        val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max,
+          min, histogram)
+
+        val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins)
+        val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins)
+        assert(lowerBinId <= higherBinId)
+        val lowerBinNdv = hgmBins(lowerBinId).ndv
+        val higherBinNdv = hgmBins(higherBinId).ndv
+        // assume uniform distribution in each bin
+        val occupiedBins = if (lowerBinId == higherBinId) {
+          1.0 / lowerBinNdv
+        } else {
+          (1.0 / lowerBinNdv) +   // lowest bin
+            (higherBinId - lowerBinId - 1) + // middle bins
+            (1.0 / higherBinNdv)  // highest bin
+        }
+        Some(occupiedBins / validRangeBins)
+      }
+
+    } else {  // not in interval
       Some(0.0)
     }
 
@@ -471,37 +507,46 @@ case class FilterEstimation(plan: Filter) extends Logging {
       percent = 1.0
     } else {
       // This is the partial overlap case:
-      // Without advanced statistics like histogram, we assume uniform data distribution.
-      // We just prorate the adjusted range over the initial range to compute filter selectivity.
-      assert(max > min)
-      percent = op match {
-        case _: LessThan =>
-          if (numericLiteral == max) {
-            // If the literal value is right on the boundary, we can minus the part of the
-            // boundary value (1/ndv).
-            1.0 - 1.0 / ndv
-          } else {
-            (numericLiteral - min) / (max - min)
-          }
-        case _: LessThanOrEqual =>
-          if (numericLiteral == min) {
-            // The boundary value is the only satisfying value.
-            1.0 / ndv
-          } else {
-            (numericLiteral - min) / (max - min)
-          }
-        case _: GreaterThan =>
-          if (numericLiteral == min) {
-            1.0 - 1.0 / ndv
-          } else {
-            (max - numericLiteral) / (max - min)
-          }
-        case _: GreaterThanOrEqual =>
-          if (numericLiteral == max) {
-            1.0 / ndv
-          } else {
-            (max - numericLiteral) / (max - min)
-          }
+
+      if (colStat.histogram.isEmpty) {
+        // Without advanced statistics like histogram, we assume uniform data distribution.
+        // We just prorate the adjusted range over the initial range to compute filter selectivity.
+        assert(max > min)
+        percent = op match {
+          case _: LessThan =>
+            if (numericLiteral == max) {
+              // If the literal value is right on the boundary, we can minus the part of the
+              // boundary value (1/ndv).
+              1.0 - 1.0 / ndv
+            } else {
+              (numericLiteral - min) / (max - min)
+            }
+          case _: LessThanOrEqual =>
+            if (numericLiteral == min) {
+              // The boundary value is the only satisfying value.
+              1.0 / ndv
+            } else {
+              (numericLiteral - min) / (max - min)
+            }
+          case _: GreaterThan =>
+            if (numericLiteral == min) {
+              1.0 - 1.0 / ndv
+            } else {
+              (max - numericLiteral) / (max - min)
+            }
+          case _: GreaterThanOrEqual =>
+            if (numericLiteral == max) {
+              1.0 / ndv
+            } else {
+              (max - numericLiteral) / (max - min)
+            }
+        }
+      } else {
+        val numericHistogram = colStat.histogram.get
+        val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
+        val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
+        val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
+        percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum)
       }
 
       if (update) {
@@ -513,10 +558,9 @@ case class FilterEstimation(plan: Filter) extends Logging {
 
         op match {
           case _: GreaterThan | _: GreaterThanOrEqual =>
-            // If new ndv is 1, then new max must be equal to new min.
-            newMin = if (newNdv == 1) newMax else newValue
+            newMin = newValue
           case _: LessThan | _: LessThanOrEqual =>
-            newMax = if (newNdv == 1) newMin else newValue
+            newMax = newValue
         }
 
         val newStats =
@@ -529,6 +573,54 @@ case class FilterEstimation(plan: Filter) extends Logging {
     Some(percent)
   }
 
+  /**
+   * Estimates, from a numeric equi-height histogram, the selectivity of a range comparison
+   * between a column and a literal, relative to the column's current valid range [min, max].
+   *
+   * Only LessThan, LessThanOrEqual, GreaterThan and GreaterThanOrEqual are matched; any other
+   * operator results in a MatchError.
+   *
+   * @param op a binary comparison operator
+   * @param histogram a numeric equi-height histogram
+   * @param max the upper bound of the current valid range for a given column
+   * @param min the lower bound of the current valid range for a given column
+   * @param datumNumber the numeric value of a literal
+   * @return the selectivity percentage for a condition in the current range.
+   */
+  def computePercentByEquiHeightHgm(
+      op: BinaryComparison,
+      histogram: Histogram,
+      max: Double,
+      min: Double,
+      datumNumber: Double): Double = {
+    // Bins holding the column's current min and max. The [min, max] range may already have
+    // been narrowed by another condition applied earlier.
+    val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins)
+    val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins)
+
+    // Number of bins (possibly fractional) spanned by [min, max]; this is the denominator.
+    val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram)
+
+    // LessThan and GreaterThanOrEqual use the first bin that can hold the literal, while
+    // LessThanOrEqual and GreaterThan use the last one.
+    def firstDatumBin: Int = EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins)
+    def lastDatumBin: Int = EstimationUtils.findLastBinForValue(datumNumber, histogram.bins)
+    // bins covered by [min, datumNumber]
+    def binsUpToDatum(datumBinId: Int): Double =
+      EstimationUtils.getOccupationBins(datumBinId, minBinId, datumNumber, min, histogram)
+    // bins covered by [datumNumber, max]
+    def binsFromDatum(datumBinId: Int): Double =
+      EstimationUtils.getOccupationBins(maxBinId, datumBinId, max, datumNumber, histogram)
+
+    val coveredBins = op match {
+      case _: LessThan => binsUpToDatum(firstDatumBin)
+      case _: LessThanOrEqual => binsUpToDatum(lastDatumBin)
+      case _: GreaterThan => binsFromDatum(lastDatumBin)
+      case _: GreaterThanOrEqual => binsFromDatum(firstDatumBin)
+    }
+    coveredBins / validRangeBins
+  }
+
   /**
    * Returns a percentage of rows meeting a binary comparison expression containing two columns.
    * In SQL queries, we also see predicate expressions involving two columns
@@ -784,11 +876,16 @@ case class ColumnStatsMap(originalMap: AttributeMap[ColumnStat]) {
   def outputColumnStats(rowsBeforeFilter: BigInt, rowsAfterFilter: BigInt)
     : AttributeMap[ColumnStat] = {
     val newColumnStats = originalMap.map { case (attr, oriColStat) =>
-      // Update ndv based on the overall filter selectivity: scale down ndv if the number of rows
-      // decreases; otherwise keep it unchanged.
-      val newNdv = EstimationUtils.updateNdv(oldNumRows = rowsBeforeFilter,
-        newNumRows = rowsAfterFilter, oldNdv = oriColStat.distinctCount)
       val colStat = updatedMap.get(attr.exprId).map(_._2).getOrElse(oriColStat)
+      val newNdv = if (colStat.distinctCount > 1) {
+        // Update ndv based on the overall filter selectivity: scale down ndv if the number of rows
+        // decreases; otherwise keep it unchanged.
+        EstimationUtils.updateNdv(oldNumRows = rowsBeforeFilter,
+          newNumRows = rowsAfterFilter, oldNdv = oriColStat.distinctCount)
+      } else {
+        // no need to scale down since it is already down to 1 (for skewed distribution case)
+        colStat.distinctCount
+      }
       attr -> colStat.copy(distinctCount = newNdv)
     }
     AttributeMap(newColumnStats.toSeq)