Skip to content

Commit

Permalink
KYLIN-3656 Improve HLLCounter performance
Browse files Browse the repository at this point in the history
  • Loading branch information
baibaichen authored and shaofengshi committed Nov 15, 2018
1 parent f0b26c4 commit 889dc50
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 8 deletions.
Expand Up @@ -35,6 +35,11 @@ public DenseRegister(int p) {
this.register = new byte[m];
}

public void copyFrom(DenseRegister r){
assert m == r.m;
System.arraycopy(r.register, 0, register, 0 , register.length);
}

public void set(int pos, byte value) {
register[pos] = value;
}
Expand Down
Expand Up @@ -32,6 +32,14 @@
@SuppressWarnings("serial")
public class HLLCounter implements Serializable, Comparable<HLLCounter> {

static double[] harmonicMean;

static {
harmonicMean = new double[256];
for (int i = 1; i < 256; i++)
harmonicMean[i] = 1.0 / (1L << i);
}

// not final for test purpose
static double OVERFLOW_FACTOR = 0.01;

Expand All @@ -57,7 +65,11 @@ public HLLCounter(int p, HashFunction hashFunc) {

public HLLCounter(HLLCounter another) {
this(another.p, another.getRegisterType(), another.hashFunc);
merge(another);
if(another.getRegisterType() == RegisterType.DENSE){
((DenseRegister)register).copyFrom((DenseRegister)another.register);
}else {
merge(another);
}
}

public HLLCounter(int p, RegisterType type) {
Expand Down Expand Up @@ -202,6 +214,8 @@ public static class HLLCSnapshot {
int zeroBuckets;

public HLLCSnapshot(HLLCounter hllc) {
int[] registerNums = new int[256];

p = (byte) hllc.p;
registerSum = 0;
zeroBuckets = 0;
Expand All @@ -215,14 +229,14 @@ public HLLCSnapshot(HLLCounter hllc) {
dr = (DenseRegister) register;
}
byte[] registers = dr.getRawRegister();
for (int i = 0; i < hllc.m; i++) {
if (registers[i] == 0) {
registerSum++;
zeroBuckets++;
} else {
registerSum += 1.0 / (1L << registers[i]);
}
for (int i = 0; i < hllc.m; i ++) {
registerNums[registers[i]] ++;
}
zeroBuckets = registerNums[0];
for (int i= 1; i < 256; i ++)
registerSum += registerNums[i] * harmonicMean[i];

registerSum += zeroBuckets;
}

public long getCountEstimate() {
Expand Down
Expand Up @@ -22,6 +22,7 @@

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
Expand Down Expand Up @@ -68,6 +69,88 @@ public void tesSparseEstimate() throws IOException {
assertTrue(hllc.getCountEstimate() > 10 * 0.9);
}

/**
* evaluation getCountEstimate of HLLCounter
* cost time : 1341[old] -> 206[new]
*/
@Test
public void countPerformanceWithLargeCardinality(){
int cardinality = 10_000_000;
HLLCounter hllc = generateTestCounter(2009, cardinality);
final int testCount = 5000;
countEstimatePerformance(hllc, cardinality, testCount);
}

/**
* evaluation getCountEstimate of HLLCounter
* cost time : 1396[old] -> 274[new]
*/
@Test
public void countPerformanceSmallCardinality(){
int cardinality = 300_000;
HLLCounter hllc = generateTestCounter(2009, cardinality);
final int testCount = 5000;
countEstimatePerformance(hllc, cardinality, testCount);
}

/**
* evaluation constructor of HLLCounter
* cost time : 1577[old] -> 490[new]
*/
@Test
public void createHLLCPerformance(){
int cardinality = 3_000_000;
HLLCounter hllc = generateTestCounter(2009, cardinality);
final int testCount = 30000;

HLLCounter hllc2 = null;
long start = System.currentTimeMillis();
for (int i = 0; i < testCount; i++){
hllc2 = new HLLCounter(hllc);
}
long totalTime = System.currentTimeMillis() - start;
System.out.println("constructor of HLLCounter cost time : " + totalTime);

long estimate = hllc2.getCountEstimate();
assertTrue(estimate > 0.9 * cardinality && estimate < 1.1 * cardinality);
System.out.println("estimate is " + estimate);
}

/**
* Simulate mutli call to method [getCountEstimate of HLLCounter] and get
* duration for method getCountEstimate of HLLCounter
*/
private void countEstimatePerformance(HLLCounter hllc, int realCount, int callTimes) {
long start = System.currentTimeMillis();
for (int i = 0; i < callTimes; i++)
hllc.getCountEstimate();
long totalTime = System.currentTimeMillis() - start;
System.out.println("getCountEstimate of HLLCounter cost time : " + totalTime);

long estimate = hllc.getCountEstimate();
assertTrue(estimate > realCount * 0.9 && estimate < realCount * 1.1);
System.out.println("estimate is " + estimate);
}

private HLLCounter generateTestCounter(int seed, int maxDistinctCounts) {
long start = System.currentTimeMillis();
Random rand1 = new Random(seed);
Set<Integer> rawData = new HashSet<>();
while (rawData.size() < maxDistinctCounts)
rawData.add(rand1.nextInt());
ArrayList<Integer> testData = new ArrayList<>(rawData);
assertEquals(maxDistinctCounts, testData.size());

HLLCounter hllc = new HLLCounter(16, RegisterType.DENSE);

for (int j = 0; j < testData.size(); j++) {
hllc.add(testData.get(j));
}
long totalTime = System.currentTimeMillis() - start;
System.out.println("generate data cost time : " + totalTime);
return hllc;
}

@Test
public void countTest() throws IOException {
int n = 10;
Expand Down

0 comments on commit 889dc50

Please sign in to comment.