Skip to content
Permalink
Browse files
Introducing a new config to ignore nulls while computing String Cardi…
…nality (#12345)

* Counting nulls in String cardinality with a config

* Adding tests for the new config

* Wrapping the vectorize part to allow backward compatibility

* Adding different tests, cleaning the code and putting the check at the proper position, handling hasRow() and hasValue() changes

* Updating testcase and code

* Adding null handling test to improve coverage

* Checkstyle fix

* Adding 1 more change in docs

* Making docs clearer
  • Loading branch information
somu-imply committed Mar 29, 2022
1 parent f1841c6 commit a1ea6581156bf415fe991cebf74182b24d5b2994
Showing 8 changed files with 193 additions and 14 deletions.
@@ -58,7 +58,13 @@
@VisibleForTesting
public static void initializeForTests()
{
INSTANCE = new NullValueHandlingConfig(null);
INSTANCE = new NullValueHandlingConfig(null, null);
}

@VisibleForTesting
public static void initializeForTestsWithValues(Boolean useDefForNull, Boolean ignoreNullForString)
{
INSTANCE = new NullValueHandlingConfig(useDefForNull, ignoreNullForString);
}

/**
@@ -73,6 +79,18 @@ public static boolean replaceWithDefault()
return INSTANCE.isUseDefaultValuesForNull();
}

/**
* whether nulls should be counted during String cardinality
*/
public static boolean ignoreNullsForStringCardinality()
{
// this should only be null in a unit test context, in production this will be injected by the null handling module
if (INSTANCE == null) {
throw new IllegalStateException("NullHandling module not initialized, call NullHandling.initializeForTests()");
}
return INSTANCE.isIgnoreNullsForStringCardinality();
}

public static boolean sqlCompatible()
{
return !replaceWithDefault();
@@ -26,20 +26,46 @@
{
public static final String NULL_HANDLING_CONFIG_STRING = "druid.generic.useDefaultValueForNull";

//added to preserve backward compatibility
//and not count nulls during cardinality aggrgation over strings

public static final String NULL_HANDLING_DURING_STRING_CARDINALITY = "druid.generic.ignoreNullsForStringCardinality";

@JsonProperty("useDefaultValueForNull")
private final boolean useDefaultValuesForNull;

@JsonProperty("ignoreNullsForStringCardinality")
private final boolean ignoreNullsForStringCardinality;


@JsonCreator
public NullValueHandlingConfig(
@JsonProperty("useDefaultValueForNull") Boolean useDefaultValuesForNull
@JsonProperty("useDefaultValueForNull") Boolean useDefaultValuesForNull,
@JsonProperty("ignoreNullsForStringCardinality") Boolean ignoreNullsForStringCardinality
)
{
if (useDefaultValuesForNull == null) {
this.useDefaultValuesForNull = Boolean.valueOf(System.getProperty(NULL_HANDLING_CONFIG_STRING, "true"));
} else {
this.useDefaultValuesForNull = useDefaultValuesForNull;
}
if (ignoreNullsForStringCardinality == null) {
this.ignoreNullsForStringCardinality = Boolean.valueOf(System.getProperty(
NULL_HANDLING_DURING_STRING_CARDINALITY,
"false"
));
} else {
if (this.useDefaultValuesForNull) {
this.ignoreNullsForStringCardinality = ignoreNullsForStringCardinality;
} else {
this.ignoreNullsForStringCardinality = false;
}
}
}

public boolean isIgnoreNullsForStringCardinality()
{
return ignoreNullsForStringCardinality;
}

public boolean isUseDefaultValuesForNull()
@@ -90,4 +90,15 @@ public void test_defaultValueForClass_object()
{
Assert.assertNull(NullHandling.defaultValueForClass(Object.class));
}

@Test
public void test_ignoreNullsStrings()
{
NullHandling.initializeForTestsWithValues(false, true);
Assert.assertFalse(NullHandling.ignoreNullsForStringCardinality());

NullHandling.initializeForTestsWithValues(true, false);
Assert.assertFalse(NullHandling.ignoreNullsForStringCardinality());

}
}
@@ -761,7 +761,7 @@ Prior to version 0.13.0, Druid string columns treated `''` and `null` values as
|Property|Description|Default|
|---|---|---|
|`druid.generic.useDefaultValueForNull`|When set to `true`, `null` values will be stored as `''` for string columns and `0` for numeric columns. Set to `false` to store and query data in SQL compatible mode.|`true`|

|`druid.generic.ignoreNullsForStringCardinality`|When set to `true`, `null` values will be ignored for the built-in cardinality aggregator over string columns. Set to `false` to include `null` values while estimating cardinality of only string columns using the built-in cardinality aggregator. This setting takes effect only when `druid.generic.useDefaultValueForNull` is set to `true` and is ignored in SQL compatibility mode. Additionally, empty strings (equivalent to null) are not counted when this is set to `true`. |`false`|
This mode does have a storage size and query performance cost, see [segment documentation](../design/segments.md#sql-compatible-null-handling) for more details.

### HTTP Client
@@ -39,7 +39,8 @@ public static void addStringToCollector(final HyperLogLogCollector collector, @N
// SQL standard spec does not count null values,
// Skip counting null values when we are not replacing null with default value.
// A special value for null in case null handling is configured to use empty string for null.
if (NullHandling.replaceWithDefault() || s != null) {
// check if nulls are to be ignored
if ((NullHandling.replaceWithDefault() && !NullHandling.ignoreNullsForStringCardinality()) || s != null) {
collector.add(CardinalityAggregator.HASH_FUNCTION.hashUnencodedChars(nullToSpecial(s)).asBytes());
}
}
@@ -51,6 +51,7 @@ public void aggregate(ByteBuffer buf, int position, int startRow, int endRow)

final HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(buf);


for (int i = startRow; i < endRow; i++) {
final String value = selector.lookupName(vector[i]);
StringCardinalityAggregatorColumnSelectorStrategy.addStringToCollector(collector, value);
@@ -74,7 +75,6 @@ public void aggregate(ByteBuffer buf, int numRows, int[] positions, @Nullable in

for (int i = 0; i < numRows; i++) {
final String s = selector.lookupName(vector[rows != null ? rows[i] : i]);

if (NullHandling.replaceWithDefault() || s != null) {
final int position = positions[i] + positionOffset;
buf.limit(position + HyperLogLogCollector.getLatestNumBytesForDenseStorage());
@@ -381,12 +381,12 @@ public CardinalityAggregatorTest()
@Test
public void testAggregateRows()
{
NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoList,
true
);


for (int i = 0; i < VALUES1.size(); ++i) {
aggregate(selectorList, agg);
}
@@ -397,6 +397,7 @@ public void testAggregateRows()
@Test
public void testAggregateValues()
{
NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoList,
false
@@ -405,13 +406,22 @@ public void testAggregateValues()
for (int i = 0; i < VALUES1.size(); ++i) {
aggregate(selectorList, agg);
}
Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0, (Double) valueAggregatorFactory.finalizeComputation(agg.get()), 0.05);
Assert.assertEquals(NullHandling.replaceWithDefault() ? 7L : 6L, rowAggregatorFactoryRounded.finalizeComputation(agg.get()));
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg.get()),
0.05
);
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 7L : 6L,
rowAggregatorFactoryRounded.finalizeComputation(agg.get())
);

}

@Test
public void testBufferAggregateRows()
{
NullHandling.initializeForTestsWithValues(null, null);
CardinalityBufferAggregator agg = new CardinalityBufferAggregator(
dimInfoList.toArray(new ColumnSelectorPlus[0]),
true
@@ -434,6 +444,7 @@ public void testBufferAggregateRows()
@Test
public void testBufferAggregateValues()
{
NullHandling.initializeForTestsWithValues(null, null);
CardinalityBufferAggregator agg = new CardinalityBufferAggregator(
dimInfoList.toArray(new ColumnSelectorPlus[0]),
false
@@ -449,13 +460,21 @@ public void testBufferAggregateValues()
for (int i = 0; i < VALUES1.size(); ++i) {
bufferAggregate(selectorList, agg, buf, pos);
}
Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0, (Double) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)), 0.05);
Assert.assertEquals(NullHandling.replaceWithDefault() ? 7L : 6L, rowAggregatorFactoryRounded.finalizeComputation(agg.get(buf, pos)));
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg.get(buf, pos)),
0.05
);
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 7L : 6L,
rowAggregatorFactoryRounded.finalizeComputation(agg.get(buf, pos))
);
}

@Test
public void testCombineRows()
{
NullHandling.initializeForTestsWithValues(null, null);
List<DimensionSelector> selector1 = Collections.singletonList(dim1);
List<DimensionSelector> selector2 = Collections.singletonList(dim2);
List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>> dimInfo1 = Collections.singletonList(
@@ -501,6 +520,7 @@ public void testCombineRows()
@Test
public void testCombineValues()
{
NullHandling.initializeForTestsWithValues(null, null);
List<DimensionSelector> selector1 = Collections.singletonList(dim1);
List<DimensionSelector> selector2 = Collections.singletonList(dim2);

@@ -528,10 +548,16 @@ public void testCombineValues()
for (int i = 0; i < VALUES2.size(); ++i) {
aggregate(selector2, agg2);
}

Assert.assertEquals(NullHandling.replaceWithDefault() ? 4.0 : 3.0, (Double) valueAggregatorFactory.finalizeComputation(agg1.get()), 0.05);
Assert.assertEquals(NullHandling.replaceWithDefault() ? 7.0 : 6.0, (Double) valueAggregatorFactory.finalizeComputation(agg2.get()), 0.05);

Assert.assertEquals(
NullHandling.replaceWithDefault() ? 4.0 : 3.0,
(Double) valueAggregatorFactory.finalizeComputation(agg1.get()),
0.05
);
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg2.get()),
0.05
);
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 7.0 : 6.0,
(Double) rowAggregatorFactory.finalizeComputation(
@@ -547,6 +573,7 @@ public void testCombineValues()
@Test
public void testAggregateRowsWithExtraction()
{
NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoListWithExtraction,
true
@@ -569,6 +596,7 @@ public void testAggregateRowsWithExtraction()
@Test
public void testAggregateValuesWithExtraction()
{
NullHandling.initializeForTestsWithValues(null, null);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoListWithExtraction,
false
@@ -635,4 +663,98 @@ public void testSerde() throws Exception
objectMapper.readValue(objectMapper.writeValueAsString(factory2), AggregatorFactory.class)
);
}

//ignoreNullsForStringCardinality tests
@Test
public void testAggregateRowsIgnoreNulls()
{
NullHandling.initializeForTestsWithValues(null, true);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoList,
true
);

for (int i = 0; i < VALUES1.size(); ++i) {
aggregate(selectorList, agg);
}
Assert.assertEquals(9.0, (Double) rowAggregatorFactory.finalizeComputation(agg.get()), 0.05);
Assert.assertEquals(9L, rowAggregatorFactoryRounded.finalizeComputation(agg.get()));
}

@Test
public void testAggregateValuesIgnoreNulls()
{
NullHandling.initializeForTestsWithValues(null, true);
CardinalityAggregator agg = new CardinalityAggregator(
dimInfoList,
false
);

for (int i = 0; i < VALUES1.size(); ++i) {
aggregate(selectorList, agg);
}
//setting is not applied when druid.generic.useDefaultValueForNull=false
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 6.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg.get()),
0.05
);
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 6L : 6L,
rowAggregatorFactoryRounded.finalizeComputation(agg.get())
);
}

@Test
public void testCombineValuesIgnoreNulls()
{
NullHandling.initializeForTestsWithValues(null, true);
List<DimensionSelector> selector1 = Collections.singletonList(dim1);
List<DimensionSelector> selector2 = Collections.singletonList(dim2);

List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>> dimInfo1 = Collections.singletonList(
new ColumnSelectorPlus<>(
dimSpec1.getDimension(),
dimSpec1.getOutputName(),
new StringCardinalityAggregatorColumnSelectorStrategy(), dim1
)
);
List<ColumnSelectorPlus<CardinalityAggregatorColumnSelectorStrategy>> dimInfo2 = Collections.singletonList(
new ColumnSelectorPlus<>(
dimSpec1.getDimension(),
dimSpec1.getOutputName(),
new StringCardinalityAggregatorColumnSelectorStrategy(), dim2
)
);

CardinalityAggregator agg1 = new CardinalityAggregator(dimInfo1, false);
CardinalityAggregator agg2 = new CardinalityAggregator(dimInfo2, false);

for (int i = 0; i < VALUES1.size(); ++i) {
aggregate(selector1, agg1);
}
for (int i = 0; i < VALUES2.size(); ++i) {
aggregate(selector2, agg2);
}
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 3.0 : 3.0,
(Double) valueAggregatorFactory.finalizeComputation(agg1.get()),
0.05
);
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 6.0 : 6.0,
(Double) valueAggregatorFactory.finalizeComputation(agg2.get()),
0.05
);
Assert.assertEquals(
NullHandling.replaceWithDefault() ? 6.0 : 6.0,
(Double) rowAggregatorFactory.finalizeComputation(
rowAggregatorFactory.combine(
agg1.get(),
agg2.get()
)
),
0.05
);
}
}
@@ -694,6 +694,7 @@ doubleMeanNoNulls
doubleMin
doubleSum
druid.generic.useDefaultValueForNull
druid.generic.ignoreNullsForStringCardinality
limitSpec
longMax
longAny

0 comments on commit a1ea658

Please sign in to comment.