From ebcd6d107087cf024362b3a901514cfc12c3dbab Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Tue, 12 Dec 2017 22:36:02 +0800
Subject: [PATCH 1/3] code cleanup

---
 .../statsEstimation/EstimationUtils.scala     | 105 ++++++------
 .../statsEstimation/FilterEstimation.scala    | 152 +++++++++---------
 2 files changed, 133 insertions(+), 124 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
index 2f416f2af91f1..f2bc8be9106fe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
@@ -115,14 +115,10 @@ object EstimationUtils {
   }
 
   /**
-   * Returns the number of the first bin into which a column value falls for a specified
+   * Returns the index of the first bin into which the given value falls for a specified
    * numeric equi-height histogram.
-   *
-   * @param value a literal value of a column
-   * @param bins an array of bins for a given numeric equi-height histogram
-   * @return the id of the first bin into which a column value falls.
    */
-  def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
+  private def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
     var i = 0
     while ((i < bins.length) && (value > bins(i).hi)) {
       i += 1
@@ -131,14 +127,10 @@ object EstimationUtils {
   }
 
   /**
-   * Returns the number of the last bin into which a column value falls for a specified
+   * Returns the index of the last bin into which the given value falls for a specified
    * numeric equi-height histogram.
-   *
-   * @param value a literal value of a column
-   * @param bins an array of bins for a given numeric equi-height histogram
-   * @return the id of the last bin into which a column value falls.
    */
-  def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
+  private def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = {
     var i = bins.length - 1
     while ((i >= 0) && (value < bins(i).lo)) {
       i -= 1
@@ -147,65 +139,78 @@ object EstimationUtils {
   }
 
   /**
-   * Returns a percentage of a bin holding values for column value in the range of
-   * [lowerValue, higherValue]
-   *
-   * @param higherValue a given upper bound value of a specified column value range
-   * @param lowerValue a given lower bound value of a specified column value range
-   * @param bin a single histogram bin
-   * @return the percentage of a single bin holding values in [lowerValue, higherValue].
+   * Returns the possibility of the given histogram bin holding values within the given range
+   * [lowerBound, upperBound].
    */
-  private def getOccupation(
-      higherValue: Double,
-      lowerValue: Double,
+  private def binHoldingRangePossibility(
+      upperBound: Double,
+      lowerBound: Double,
       bin: HistogramBin): Double = {
-    assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi)
+    assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bin.hi)
     if (bin.hi == bin.lo) {
       // the entire bin is covered in the range
       1.0
-    } else if (higherValue == lowerValue) {
+    } else if (upperBound == lowerBound) {
       // set percentage to 1/NDV
       1.0 / bin.ndv.toDouble
     } else {
       // Use proration since the range falls inside this bin.
-      math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0)
+      math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0)
     }
   }
 
   /**
-   * Returns the number of bins for column values in [lowerValue, higherValue].
-   * The column value distribution is saved in an equi-height histogram.  The return values is a
-   * double value is because we may return a portion of a bin. For example, a predicate
-   * "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values.
+   * Returns the number of histogram bins holding values within the given range
+   * [lowerBound, upperBound].
    *
-   * @param higherId id of the high end bin holding the high end value of a column range
-   * @param lowerId id of the low end bin holding the low end value of a column range
-   * @param higherEnd a given upper bound value of a specified column value range
-   * @param lowerEnd a given lower bound value of a specified column value range
+   * Note that the return value is double type, because the range boundaries usually occupy a
+   * portion of a bin. An extrema case is [value, value] which is generated by equal predicate
+   * `col = value`, we can get more accuracy by allowing returning portion of histogram bins.
+   *
+   * @param upperBound the highest value of the given range
+   * @param upperBoundInclusive whether the upperBound is included in the range
+   * @param lowerBound the lowest value of the given range
+   * @param lowerBoundInclusive whether the lowerBound is included in the range
    * @param histogram a numeric equi-height histogram
-   * @return the number of bins for column values in [lowerEnd, higherEnd].
    */
-  def getOccupationBins(
-      higherId: Int,
-      lowerId: Int,
-      higherEnd: Double,
-      lowerEnd: Double,
+  def numBinsHoldingRange(
+      upperBound: Double,
+      upperBoundInclusive: Boolean,
+      lowerBound: Double,
+      lowerBoundInclusive: Boolean,
       histogram: Histogram): Double = {
-    assert(lowerId <= higherId)
+    assert(histogram.bins.head.lo <= lowerBound &&
+      lowerBound <= upperBound &&
+      upperBound <= histogram.bins.last.hi,
+      "Given range does not fit in the given histogram.")
+    assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive,
+      s"'$lowerBound < value < $upperBound' is an invalid range.")
+
+    val upperBinIndex = if (upperBoundInclusive) {
+      findLastBinForValue(upperBound, histogram.bins)
+    } else {
+      findFirstBinForValue(upperBound, histogram.bins)
+    }
+    val lowerBinIndex = if (lowerBoundInclusive) {
+      findFirstBinForValue(lowerBound, histogram.bins)
+    } else {
+      findLastBinForValue(lowerBound, histogram.bins)
+    }
+    assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.")
+
 
-    if (lowerId == higherId) {
-      val curBin = histogram.bins(lowerId)
-      getOccupation(higherEnd, lowerEnd, curBin)
+    if (lowerBinIndex == upperBinIndex) {
+      binHoldingRangePossibility(upperBound, lowerBound, histogram.bins(lowerBinIndex))
     } else {
-      // compute how much lowerEnd/higherEnd occupies its bin
-      val lowerCurBin = histogram.bins(lowerId)
-      val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin)
+      // Computes the occupied portion of bins of the upperBound and lowerBound.
+      val lowerBin = histogram.bins(lowerBinIndex)
+      val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin)
 
-      val higherCurBin = histogram.bins(higherId)
-      val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin)
+      val higherBin = histogram.bins(upperBinIndex)
+      val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin)
 
-      // the total length is lowerPart + higherPart + bins between them
-      lowerPart + higherPart + higherId - lowerId - 1
+      // The total number of bins is lowerPart + higherPart + bins between them
+      lowerPart + higherPart + upperBinIndex - lowerBinIndex - 1
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
index f52a15edbfe4a..c2170060fbb2a 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging {
         // returns 1/ndv if there is no histogram
         Some(1.0 / BigDecimal(ndv))
       } else {
-        // We compute filter selectivity using Histogram information.
-        val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
-        val histogram = colStat.histogram.get
-        val hgmBins = histogram.bins
-
-        // find bins where column's current min and max locate.  Note that a column's [min, max]
-        // range may change due to another condition applied earlier.
-        val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
-        val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
-        val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins)
-        val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins)
-
-        // compute how many bins the column's current valid range [min, max] occupies.
-        // Note that a column's [min, max] range may vary after we apply some filter conditions.
-        val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max,
-          min, histogram)
-
-        val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins)
-        val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins)
-        assert(lowerBinId <= higherBinId)
-        val lowerBinNdv = hgmBins(lowerBinId).ndv
-        val higherBinNdv = hgmBins(higherBinId).ndv
-        // assume uniform distribution in each bin
-        val occupiedBins = if (lowerBinId == higherBinId) {
-          1.0 / lowerBinNdv
-        } else {
-          (1.0 / lowerBinNdv) +   // lowest bin
-            (higherBinId - lowerBinId - 1) + // middle bins
-            (1.0 / higherBinNdv)  // highest bin
-        }
-        Some(occupiedBins / validRangeBins)
+        Some(computeEqualityPossibilityByHistogram(literal, colStat))
       }
 
     } else {  // not in interval
       Some(0.0)
     }
-
   }
 
   /**
@@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
             }
         }
       } else {
-        val numericHistogram = colStat.histogram.get
-        val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
-        val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
-        val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
-        percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum)
+        percent = computeComparisonPossibilityByHistogram(op, literal, colStat)
       }
 
       if (update) {
@@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging {
   }
 
   /**
-   * Returns the selectivity percentage for binary condition in the column's
-   * current valid range [min, max]
-   *
-   * @param op a binary comparison operator
-   * @param histogram a numeric equi-height histogram
-   * @param max the upper bound of the current valid range for a given column
-   * @param min the lower bound of the current valid range for a given column
-   * @param datumNumber the numeric value of a literal
-   * @return the selectivity percentage for a condition in the current range.
+   * Computes the possibility of a equal predicate using histogram.
    */
+  private def computeEqualityPossibilityByHistogram(
+      literal: Literal, colStat: ColumnStat): Double = {
+    val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
+    val histogram = colStat.histogram.get
 
-  def computePercentByEquiHeightHgm(
-      op: BinaryComparison,
-      histogram: Histogram,
-      max: Double,
-      min: Double,
-      datumNumber: Double): Double = {
     // find bins where column's current min and max locate.  Note that a column's [min, max]
     // range may change due to another condition applied earlier.
-    val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins)
-    val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins)
+    val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
+    val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
 
     // compute how many bins the column's current valid range [min, max] occupies.
-    // Note that a column's [min, max] range may vary after we apply some filter conditions.
-    val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram)
-
-    val datumInBinId = op match {
-      case LessThan(_, _) | GreaterThanOrEqual(_, _) =>
-        EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins)
-      case LessThanOrEqual(_, _) | GreaterThan(_, _) =>
-        EstimationUtils.findLastBinForValue(datumNumber, histogram.bins)
-    }
+    val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
+      upperBound = max,
+      upperBoundInclusive = true,
+      lowerBound = min,
+      lowerBoundInclusive = true,
+      histogram)
+
+    val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
+      upperBound = datum,
+      upperBoundInclusive = true,
+      lowerBound = datum,
+      lowerBoundInclusive = true,
+      histogram)
+
+    numBinsHoldingDatum / numBinsHoldingEntireRange
+  }
 
-    op match {
-      // LessThan and LessThanOrEqual share the same logic,
-      // but their datumInBinId may be different
-      case LessThan(_, _) | LessThanOrEqual(_, _) =>
-        EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, min,
-          histogram) / minToMaxLength
-      // GreaterThan and GreaterThanOrEqual share the same logic,
-      // but their datumInBinId may be different
-      case GreaterThan(_, _) | GreaterThanOrEqual(_, _) =>
-        EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, datumNumber,
-          histogram) / minToMaxLength
+  /**
+   * Computes the possibility of a comparison predicate using histogram.
+   */
+  private def computeComparisonPossibilityByHistogram(
+      op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = {
+    val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble
+    val histogram = colStat.histogram.get
+
+    // find bins where column's current min and max locate.  Note that a column's [min, max]
+    // range may change due to another condition applied earlier.
+    val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble
+    val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble
+
+    // compute how many bins the column's current valid range [min, max] occupies.
+    val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
+      max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram)
+
+    val numBinsHoldingDatum = op match {
+      // LessThan and LessThanOrEqual share the same logic, the only difference is whether to
+      // include the upperBound in the range.
+      case _: LessThan =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = datum,
+          upperBoundInclusive = false,
+          lowerBound = min,
+          lowerBoundInclusive = true,
+          histogram)
+      case _: LessThanOrEqual =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = datum,
+          upperBoundInclusive = true,
+          lowerBound = min,
+          lowerBoundInclusive = true,
+          histogram)
+
+      // GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to
+      // include the lowerBound in the range.
+      case _: GreaterThan =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = max,
+          upperBoundInclusive = true,
+          lowerBound = datum,
+          lowerBoundInclusive = false,
+          histogram)
+      case _: GreaterThanOrEqual =>
+        EstimationUtils.numBinsHoldingRange(
+          upperBound = max,
+          upperBoundInclusive = true,
+          lowerBound = datum,
+          lowerBoundInclusive = true,
+          histogram)
     }
+
+    numBinsHoldingDatum / numBinsHoldingEntireRange
   }
 
   /**

From 8fe0c4991b90781a7017de4938705bbc32244dc6 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Wed, 13 Dec 2017 10:51:35 +0800
Subject: [PATCH 2/3] address comments

---
 .../statsEstimation/EstimationUtils.scala     | 26 +++++++++----------
 .../statsEstimation/FilterEstimation.scala    | 20 +++++++-------
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
index f2bc8be9106fe..484ad2beffbaa 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
@@ -163,50 +163,48 @@ object EstimationUtils {
    * Returns the number of histogram bins holding values within the given range
    * [lowerBound, upperBound].
    *
-   * Note that the return value is double type, because the range boundaries usually occupy a
+   * Note that the returned value is double type, because the range boundaries usually occupy a
    * portion of a bin. An extrema case is [value, value] which is generated by equal predicate
-   * `col = value`, we can get more accuracy by allowing returning portion of histogram bins.
+   * `col = value`, we can get higher accuracy by allowing returning portion of histogram bins.
    *
    * @param upperBound the highest value of the given range
    * @param upperBoundInclusive whether the upperBound is included in the range
    * @param lowerBound the lowest value of the given range
    * @param lowerBoundInclusive whether the lowerBound is included in the range
-   * @param histogram a numeric equi-height histogram
+   * @param bins an array of bins for a given numeric equi-height histogram
    */
   def numBinsHoldingRange(
       upperBound: Double,
       upperBoundInclusive: Boolean,
       lowerBound: Double,
       lowerBoundInclusive: Boolean,
-      histogram: Histogram): Double = {
-    assert(histogram.bins.head.lo <= lowerBound &&
-      lowerBound <= upperBound &&
-      upperBound <= histogram.bins.last.hi,
+      bins: Array[HistogramBin]): Double = {
+    assert(bins.head.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bins.last.hi,
       "Given range does not fit in the given histogram.")
     assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive,
       s"'$lowerBound < value < $upperBound' is an invalid range.")
 
     val upperBinIndex = if (upperBoundInclusive) {
-      findLastBinForValue(upperBound, histogram.bins)
+      findLastBinForValue(upperBound, bins)
     } else {
-      findFirstBinForValue(upperBound, histogram.bins)
+      findFirstBinForValue(upperBound, bins)
     }
     val lowerBinIndex = if (lowerBoundInclusive) {
-      findFirstBinForValue(lowerBound, histogram.bins)
+      findFirstBinForValue(lowerBound, bins)
     } else {
-      findLastBinForValue(lowerBound, histogram.bins)
+      findLastBinForValue(lowerBound, bins)
     }
     assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.")
 
 
     if (lowerBinIndex == upperBinIndex) {
-      binHoldingRangePossibility(upperBound, lowerBound, histogram.bins(lowerBinIndex))
+      binHoldingRangePossibility(upperBound, lowerBound, bins(lowerBinIndex))
     } else {
       // Computes the occupied portion of bins of the upperBound and lowerBound.
-      val lowerBin = histogram.bins(lowerBinIndex)
+      val lowerBin = bins(lowerBinIndex)
       val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin)
 
-      val higherBin = histogram.bins(upperBinIndex)
+      val higherBin = bins(upperBinIndex)
       val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin)
 
       // The total number of bins is lowerPart + higherPart + bins between them
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
index c2170060fbb2a..850dd1ba724a0 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -539,7 +539,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
   }
 
   /**
-   * Computes the possibility of a equal predicate using histogram.
+   * Computes the possibility of an equality predicate using histogram.
    */
   private def computeEqualityPossibilityByHistogram(
       literal: Literal, colStat: ColumnStat): Double = {
@@ -557,14 +557,14 @@ case class FilterEstimation(plan: Filter) extends Logging {
       upperBoundInclusive = true,
       lowerBound = min,
       lowerBoundInclusive = true,
-      histogram)
+      histogram.bins)
 
     val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange(
       upperBound = datum,
       upperBoundInclusive = true,
       lowerBound = datum,
       lowerBoundInclusive = true,
-      histogram)
+      histogram.bins)
 
     numBinsHoldingDatum / numBinsHoldingEntireRange
   }
@@ -584,9 +584,9 @@ case class FilterEstimation(plan: Filter) extends Logging {
 
     // compute how many bins the column's current valid range [min, max] occupies.
     val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange(
-      max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram)
+      max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram.bins)
 
-    val numBinsHoldingDatum = op match {
+    val numBinsHoldingRange = op match {
       // LessThan and LessThanOrEqual share the same logic, the only difference is whether to
       // include the upperBound in the range.
       case _: LessThan =>
@@ -595,14 +595,14 @@ case class FilterEstimation(plan: Filter) extends Logging {
           upperBoundInclusive = false,
           lowerBound = min,
           lowerBoundInclusive = true,
-          histogram)
+          histogram.bins)
       case _: LessThanOrEqual =>
         EstimationUtils.numBinsHoldingRange(
           upperBound = datum,
           upperBoundInclusive = true,
           lowerBound = min,
           lowerBoundInclusive = true,
-          histogram)
+          histogram.bins)
 
       // GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to
       // include the lowerBound in the range.
@@ -612,17 +612,17 @@ case class FilterEstimation(plan: Filter) extends Logging {
           upperBoundInclusive = true,
           lowerBound = datum,
           lowerBoundInclusive = false,
-          histogram)
+          histogram.bins)
       case _: GreaterThanOrEqual =>
         EstimationUtils.numBinsHoldingRange(
           upperBound = max,
           upperBoundInclusive = true,
           lowerBound = datum,
           lowerBoundInclusive = true,
-          histogram)
+          histogram.bins)
     }
 
-    numBinsHoldingDatum / numBinsHoldingEntireRange
+    numBinsHoldingRange / numBinsHoldingEntireRange
   }
 
   /**

From 4e35c43957cf27b105c8f6b8ff19621aac540098 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <wenchen@databricks.com>
Date: Wed, 13 Dec 2017 11:38:31 +0800
Subject: [PATCH 3/3] fix typo

---
 .../plans/logical/statsEstimation/EstimationUtils.scala         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
index 484ad2beffbaa..6f868cbd072c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala
@@ -164,7 +164,7 @@ object EstimationUtils {
    * [lowerBound, upperBound].
    *
    * Note that the returned value is double type, because the range boundaries usually occupy a
-   * portion of a bin. An extrema case is [value, value] which is generated by equal predicate
+   * portion of a bin. An extreme case is [value, value] which is generated by equal predicate
    * `col = value`, we can get higher accuracy by allowing returning portion of histogram bins.
    *
    * @param upperBound the highest value of the given range