Skip to content

Commit

Permalink
improved comments for how numDV is computed from HLL
Browse files Browse the repository at this point in the history
  • Loading branch information
asolimando committed Sep 13, 2022
1 parent d46547c commit e2fcd42
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true);

// the aggregation does not update hll, only numNDVs is, it keeps the first hll
// the aggregation does not update the hll (it keeps the first one); only numDVs is updated
// note that numDVs is computed using HLL, so it can detect that 'DATE_3' appears twice
ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(7)
.low(DATE_1).high(DATE_7).hll(values1).build();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
List<String> partitions = Arrays.asList("part1", "part2", "part3");

ColumnStatisticsData data1 = new ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(3)
.low(ONE).high(TWO).hll(1, 2, 3).build();
.low(ONE).high(THREE).hll(1, 2, 3).build();
ColumnStatisticsData data2 = new ColStatsBuilder<>(Decimal.class).numNulls(2).numDVs(3)
.low(THREE).high(FIVE).hll(3, 4, 5).build();
ColumnStatisticsData data3 = new ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(2)
Expand All @@ -144,7 +144,8 @@ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
DecimalColumnStatsAggregator aggregator = new DecimalColumnStatsAggregator();
ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true);

// the aggregation does not update hll, only numNDVs is, it keeps the first hll
// the aggregation does not update the hll (it keeps the first one); only numDVs is updated
// note that numDVs is computed using HLL, so it can detect that '3' appears twice
ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Decimal.class).numNulls(6).numDVs(7)
.low(ONE).high(SEVEN).hll(1, 2, 3).build();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true);

// the aggregation does not update hll, only numNDVs is, it keeps the first hll
// the aggregation does not update the hll (it keeps the first one); only numDVs is updated
// note that numDVs is computed using HLL, so it can detect that '3' appears twice
ColumnStatisticsData expectedStats = new ColStatsBuilder<>(double.class).numNulls(6).numDVs(7)
.low(1d).high(7d).hll(1, 2, 3).build();
Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true);

// the aggregation does not update hll, only numNDVs is, it keeps the first hll
// the aggregation does not update the hll (it keeps the first one); only numDVs is updated
// note that numDVs is computed using HLL, so it can detect that '3' appears twice
ColumnStatisticsData expectedStats = new ColStatsBuilder<>(long.class).numNulls(6).numDVs(7)
.low(1L).high(7L).hll(1, 2, 3).build();
Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
TimestampColumnStatsAggregator aggregator = new TimestampColumnStatsAggregator();
ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList, partitions, true);

// the aggregation does not update hll, only numNDVs is, it keeps the first hll
// the aggregation does not update the hll (it keeps the first one); only numDVs is updated
// note that numDVs is computed using HLL, so it can detect that 'TS_3' appears twice
ColumnStatisticsData expectedStats = new ColStatsBuilder<>(Timestamp.class).numNulls(6).numDVs(7)
.low(TS_1).high(TS_7).hll(values1).build();

Expand Down

0 comments on commit e2fcd42

Please sign in to comment.