From 75cb0887deb9e5d27b7d6e5fa1129df4a953c641 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 6 Sep 2016 14:53:49 +0100 Subject: [PATCH 1/2] Actually call compress() in QuantileSummaries, and avoid expensive ArrayBuffer.prepend --- .../spark/sql/catalyst/util/QuantileSummaries.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala index 7512ace188569..e23123254ab9f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.util -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import org.apache.spark.sql.catalyst.util.QuantileSummaries.Stats @@ -61,7 +61,12 @@ class QuantileSummaries( def insert(x: Double): QuantileSummaries = { headSampled += x if (headSampled.size >= defaultHeadSize) { - this.withHeadBufferInserted + val result = this.withHeadBufferInserted + if (result.sampled.length >= compressThreshold) { + result.compress() + } else { + result + } } else { this } @@ -236,7 +241,7 @@ object QuantileSummaries { if (currentSamples.isEmpty) { return Array.empty[Stats] } - val res: ArrayBuffer[Stats] = ArrayBuffer.empty + val res = ListBuffer.empty[Stats] // Start for the last element, which is always part of the set. // The head contains the current new head, that may be merged with the current element. 
var head = currentSamples.last From 86afd440f04984f6413da70b5a53322e0167d22c Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Wed, 7 Sep 2016 14:04:59 -0700 Subject: [PATCH 2/2] work --- .../sql/catalyst/util/QuantileSummaries.scala | 12 ++++++-- .../util/QuantileSummariesSuite.scala | 29 +++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala index 493b5faf9e50a..02e0f40b021fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala @@ -60,11 +60,16 @@ class QuantileSummaries( */ def insert(x: Double): QuantileSummaries = { headSampled.append(x) - if (headSampled.size >= defaultHeadSize) { + val withInsertion = if (headSampled.size >= defaultHeadSize) { this.withHeadBufferInserted } else { this } + if (withInsertion.sampled.length >= compressThreshold) { + withInsertion.compress() + } else { + withInsertion + } } /** @@ -258,7 +263,10 @@ object QuantileSummaries { } res.prepend(head) // If necessary, add the minimum element: - res.prepend(currentSamples.head) + val currHead = currentSamples.head + if (currHead.value < head.value) { + res.prepend(currentSamples.head) + } res.toArray } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala index 89b2a22a3de45..5e90970b1bb2e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/QuantileSummariesSuite.scala @@ -40,6 +40,20 @@ class QuantileSummariesSuite extends SparkFunSuite { summary.compress() } + /** + * Interleaves 
compression and insertions. + */ + private def buildCompressSummary( + data: Seq[Double], + epsi: Double, + threshold: Int): QuantileSummaries = { + var summary = new QuantileSummaries(threshold, epsi) + data.foreach { x => + summary = summary.insert(x).compress() + } + summary + } + private def checkQuantile(quant: Double, data: Seq[Double], summary: QuantileSummaries): Unit = { val approx = summary.query(quant) // The rank of the approximation. @@ -54,8 +68,8 @@ class QuantileSummariesSuite extends SparkFunSuite { for { (seq_name, data) <- Seq(increasing, decreasing, random) - epsi <- Seq(0.1, 0.0001) - compression <- Seq(1000, 10) + epsi <- Seq(0.1, 0.0001) // A significant error tolerance, and one giving full precision + compression <- Seq(1000, 10) // Values above and below n, to test without and with compression } { test(s"Extremas with epsi=$epsi and seq=$seq_name, compression=$compression") { @@ -75,6 +89,17 @@ class QuantileSummariesSuite extends SparkFunSuite { checkQuantile(0.1, data, s) checkQuantile(0.001, data, s) } + + test(s"Some quantile values with epsi=$epsi and seq=$seq_name, compression=$compression " + + s"(interleaved)") { + val s = buildCompressSummary(data, epsi, compression) + assert(s.count == data.size, s"Found count=${s.count} but data size=${data.size}") + checkQuantile(0.9999, data, s) + checkQuantile(0.9, data, s) + checkQuantile(0.5, data, s) + checkQuantile(0.1, data, s) + checkQuantile(0.001, data, s) + } } // Tests for merging procedure