Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KYLIN-3656 Improve HLLCounter performance #345

Merged
merged 1 commit into from Nov 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -35,6 +35,11 @@ public DenseRegister(int p) {
this.register = new byte[m];
}

/**
 * Overwrites the contents of this register with the contents of {@code r}.
 *
 * @param r the register to copy from; its {@code m} must equal this register's {@code m}
 * @throws IllegalArgumentException if the two registers have different {@code m}
 */
public void copyFrom(DenseRegister r) {
    // Checked explicitly instead of with `assert`: assertions are normally disabled at
    // runtime, and a size mismatch would then either fail inside arraycopy or copy only
    // the first register.length bytes of the source.
    if (m != r.m) {
        throw new IllegalArgumentException(
                "Cannot copy a register with m=" + r.m + " into a register with m=" + m);
    }
    System.arraycopy(r.register, 0, register, 0, register.length);
}

/**
 * Stores {@code value} in the bucket at index {@code pos}.
 *
 * @param pos   index into the backing byte array
 * @param value value to store at that index
 */
public void set(int pos, byte value) {
    this.register[pos] = value;
}
Expand Down
Expand Up @@ -32,6 +32,14 @@
@SuppressWarnings("serial")
public class HLLCounter implements Serializable, Comparable<HLLCounter> {

/**
 * Lookup table of inverse powers of two: {@code harmonicMean[i] == 2^-i} for
 * {@code 1 <= i < 256}. Index 0 is left at {@code 0.0}.
 */
static double[] harmonicMean;

static {
    harmonicMean = new double[256];
    for (int i = 1; i < 256; i++) {
        // Math.scalb yields the exact value 2^-i for every index in the table.
        // The previous form, 1.0 / (1L << i), is only correct for i < 63: a long shift
        // uses just the low six bits of the shift count, so i == 63 produced a negative
        // value and i >= 64 wrapped around to the entries for i - 64.
        harmonicMean[i] = Math.scalb(1.0, -i);
    }
}

// not final for test purpose
static double OVERFLOW_FACTOR = 0.01;

Expand All @@ -57,7 +65,11 @@ public HLLCounter(int p, HashFunction hashFunc) {

/**
 * Copy constructor: creates a counter with the same precision, register type and hash
 * function as {@code another}, holding the same register contents.
 *
 * @param another the counter to copy
 */
public HLLCounter(HLLCounter another) {
    this(another.p, another.getRegisterType(), another.hashFunc);
    if (another.getRegisterType() == RegisterType.DENSE) {
        // Fast path: the new counter starts empty, so a bulk copy of the dense register
        // replaces the bucket-by-bucket merge. The former unconditional merge(another)
        // ahead of this branch has been dropped; it repeated the work the copy replaces.
        ((DenseRegister) register).copyFrom((DenseRegister) another.register);
    } else {
        merge(another);
    }
}

public HLLCounter(int p, RegisterType type) {
Expand Down Expand Up @@ -202,6 +214,8 @@ public static class HLLCSnapshot {
int zeroBuckets;

public HLLCSnapshot(HLLCounter hllc) {
int[] registerNums = new int[256];

p = (byte) hllc.p;
registerSum = 0;
zeroBuckets = 0;
Expand All @@ -215,14 +229,14 @@ public HLLCSnapshot(HLLCounter hllc) {
dr = (DenseRegister) register;
}
byte[] registers = dr.getRawRegister();
for (int i = 0; i < hllc.m; i++) {
if (registers[i] == 0) {
registerSum++;
zeroBuckets++;
} else {
registerSum += 1.0 / (1L << registers[i]);
}
for (int i = 0; i < hllc.m; i ++) {
registerNums[registers[i]] ++;
}
zeroBuckets = registerNums[0];
for (int i= 1; i < 256; i ++)
registerSum += registerNums[i] * harmonicMean[i];

registerSum += zeroBuckets;
}

public long getCountEstimate() {
Expand Down
Expand Up @@ -22,6 +22,7 @@

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
Expand Down Expand Up @@ -68,6 +69,88 @@ public void tesSparseEstimate() throws IOException {
assertTrue(hllc.getCountEstimate() > 10 * 0.9);
}

/**
 * Times repeated {@code getCountEstimate} calls on a counter filled with
 * 10,000,000 distinct values.
 * Cost time noted for this change: 1341 [old] -> 206 [new].
 */
@Test
public void countPerformanceWithLargeCardinality() {
    final int distinctValues = 10_000_000;
    final int estimateCalls = 5000;
    HLLCounter counter = generateTestCounter(2009, distinctValues);
    countEstimatePerformance(counter, distinctValues, estimateCalls);
}

/**
 * Times repeated {@code getCountEstimate} calls on a counter filled with
 * 300,000 distinct values.
 * Cost time noted for this change: 1396 [old] -> 274 [new].
 */
@Test
public void countPerformanceSmallCardinality() {
    final int distinctValues = 300_000;
    final int estimateCalls = 5000;
    HLLCounter counter = generateTestCounter(2009, distinctValues);
    countEstimatePerformance(counter, distinctValues, estimateCalls);
}

/**
 * Times the {@code HLLCounter} copy constructor by copying one filled counter
 * 30,000 times, then checks that the last copy still estimates the original
 * cardinality to within 10%.
 * Cost time noted for this change: 1577 [old] -> 490 [new].
 */
@Test
public void createHLLCPerformance() {
    int cardinality = 3_000_000;
    HLLCounter hllc = generateTestCounter(2009, cardinality);
    final int testCount = 30000;

    HLLCounter hllc2 = null;
    // nanoTime is the monotonic clock intended for elapsed-time measurement;
    // currentTimeMillis follows the wall clock and can jump if the clock is adjusted.
    long start = System.nanoTime();
    for (int i = 0; i < testCount; i++) {
        hllc2 = new HLLCounter(hllc);
    }
    long totalTime = (System.nanoTime() - start) / 1_000_000L; // reported in milliseconds
    System.out.println("constructor of HLLCounter cost time : " + totalTime);

    long estimate = hllc2.getCountEstimate();
    assertTrue(estimate > 0.9 * cardinality && estimate < 1.1 * cardinality);
    System.out.println("estimate is " + estimate);
}

/**
 * Simulates multiple calls to {@code HLLCounter.getCountEstimate}, prints the total
 * duration of those calls, and checks that the estimate is within 10% of the real count.
 *
 * @param hllc      the counter to query
 * @param realCount the true number of distinct values added to {@code hllc}
 * @param callTimes how many times {@code getCountEstimate} is called for the timing
 */
private void countEstimatePerformance(HLLCounter hllc, int realCount, int callTimes) {
    // nanoTime is the monotonic clock intended for elapsed-time measurement;
    // currentTimeMillis follows the wall clock and can jump if the clock is adjusted.
    long start = System.nanoTime();
    for (int i = 0; i < callTimes; i++) {
        hllc.getCountEstimate();
    }
    long totalTime = (System.nanoTime() - start) / 1_000_000L; // reported in milliseconds
    System.out.println("getCountEstimate of HLLCounter cost time : " + totalTime);

    long estimate = hllc.getCountEstimate();
    assertTrue(estimate > realCount * 0.9 && estimate < realCount * 1.1);
    System.out.println("estimate is " + estimate);
}

/**
 * Builds a dense {@code HLLCounter} with p = 16 and adds {@code maxDistinctCounts}
 * distinct pseudo-random integers to it. Prints the time taken to generate the data
 * and fill the counter.
 *
 * @param seed              seed for the pseudo-random generator, so runs are repeatable
 * @param maxDistinctCounts number of distinct integers to add
 * @return the filled counter
 */
private HLLCounter generateTestCounter(int seed, int maxDistinctCounts) {
    // nanoTime is the monotonic clock intended for elapsed-time measurement;
    // currentTimeMillis follows the wall clock and can jump if the clock is adjusted.
    long start = System.nanoTime();
    Random rand1 = new Random(seed);
    // The set removes duplicate draws, so exactly maxDistinctCounts values are kept.
    Set<Integer> rawData = new HashSet<>();
    while (rawData.size() < maxDistinctCounts) {
        rawData.add(rand1.nextInt());
    }
    ArrayList<Integer> testData = new ArrayList<>(rawData);
    assertEquals(maxDistinctCounts, testData.size());

    HLLCounter hllc = new HLLCounter(16, RegisterType.DENSE);

    for (Integer value : testData) {
        hllc.add(value);
    }
    long totalTime = (System.nanoTime() - start) / 1_000_000L; // reported in milliseconds
    System.out.println("generate data cost time : " + totalTime);
    return hllc;
}

@Test
public void countTest() throws IOException {
int n = 10;
Expand Down