Skip to content

Commit

Permalink
restore legacy ReduceOperations.approxUniques; rename bijection-aware…
Browse files Browse the repository at this point in the history
… method to approximateUniqueCount
  • Loading branch information
Aaron Siegel committed Feb 20, 2013
1 parent a8bb6ea commit a2cac76
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion src/main/scala/com/twitter/scalding/ReduceOperations.scala
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,23 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ
* 0.25% error ~ 256kB
* }}}
*/
def approxUniques[T <% Array[Byte]](f : (Fields, Fields), errPercent : Double = 1.0) = {
def approximateUniqueCount[T <% Array[Byte]](f : (Fields, Fields), errPercent : Double = 1.0) = {
  // Delegate to the shared HLL helper and keep only the sketch's estimated
  // size (a Double). The T => Array[Byte] view demanded by the bound is
  // forwarded implicitly to hyperLogLogMap, which declares the same bound.
  hyperLogLogMap[T, Double](f, errPercent)(sketch => sketch.estimatedSize)
}

/**
 * Calls hyperLogLogMap with the given field pair and error percentage and
 * returns the HLL value itself, unchanged, rather than a number derived from it.
 *
 * @tparam T input type; a view T => Array[Byte] must be available
 * @param f field pair handed straight to hyperLogLogMap
 *          (presumably input -> output fields; confirm against hyperLogLogMap)
 * @param errPercent error percentage forwarded to hyperLogLogMap (defaults to 1.0)
 */
def hyperLogLog[T <% Array[Byte]](f : (Fields, Fields), errPercent : Double = 1.0) = {
  // The result function is the identity on HLL: no post-processing is applied.
  hyperLogLogMap[T, HLL](f, errPercent)(identity)
}

/**
 * Legacy form of the approximate unique count, kept for existing callers.
 * It fixes the element type to CTuple and supplies the CTuple => Array[Byte]
 * conversion by hand instead of relying on an implicit view.
 *
 * @param f field pair handed straight to hyperLogLogMap
 * @param errPercent error percentage forwarded to hyperLogLogMap (defaults to 1.0)
 */
@deprecated("use of approximateUniqueCount is preferred.", "0.8.3")
def approxUniques(f : (Fields, Fields), errPercent : Double = 1.0) = {
  // Pre-bijection behaviour: each tuple is rendered with toString and the
  // UTF-8 bytes of that string are what the HLL helper sees. Kludgy, but it
  // is what the legacy method did, so it is spelled out explicitly here.
  val legacyTupleBytes : CTuple => Array[Byte] =
    (tuple : CTuple) => tuple.toString.getBytes("UTF-8")

  // The third argument list fills the implicit view parameter that the
  // `T <% Array[Byte]` bound on hyperLogLogMap would otherwise look up.
  hyperLogLogMap[CTuple, Double](f, errPercent)(sketch => sketch.estimatedSize)(legacyTupleBytes)
}

private[this] def hyperLogLogMap[T <% Array[Byte],U](f : (Fields, Fields), errPercent : Double = 1.0)(fn : HLL => U) = {
//bits = log(m) == 2 *log(104/errPercent) = 2log(104) - 2*log(errPercent)
def log2(x : Double) = scala.math.log(x)/scala.math.log(2.0)
Expand Down

0 comments on commit a2cac76

Please sign in to comment.