Skip to content

Commit

Permalink
fix issue with auto column grouping (#16489)
Browse files Browse the repository at this point in the history
* fix issue with auto column grouping
changes:
* fixes bug where AutoTypeColumnIndexer reports incorrect cardinality, allowing it to incorrectly use array grouper algorithm for realtime queries producing incorrect results for strings
* fixes bug where auto LONG and DOUBLE type columns incorrectly report not having null values, resulting in incorrect null handling when grouping

* fix test
  • Loading branch information
clintropolis authored May 27, 2024
1 parent 6bc2953 commit 4e1de50
Show file tree
Hide file tree
Showing 6 changed files with 315 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ public StructuredData getMaxValue()
@Override
public int getCardinality()
{
return globalDictionary.getCardinality();
return DimensionDictionarySelector.CARDINALITY_UNKNOWN;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ public void read(ByteBuffer buffer, ColumnBuilder builder, ColumnConfig columnCo
capabilitiesBuilder.setDictionaryValuesSorted(true);
capabilitiesBuilder.setDictionaryValuesUnique(true);
builder.setType(logicalType);
builder.setHasNulls(hasNulls);
builder.setNestedCommonFormatColumnSupplier(supplier);
builder.setIndexSupplier(supplier, true, false);
builder.setColumnFormat(new NestedCommonFormatColumn.Format(logicalType, capabilitiesBuilder.hasNulls().isTrue(), enforceLogicalType));
Expand All @@ -225,6 +226,7 @@ public void read(ByteBuffer buffer, ColumnBuilder builder, ColumnConfig columnCo
// technically, these columns are dictionary encoded, however they do not implement the DictionaryEncodedColumn
// interface, so do not make the claim in the ColumnCapabilities
builder.setType(logicalType);
builder.setHasNulls(hasNulls);
builder.setNestedCommonFormatColumnSupplier(supplier);
builder.setIndexSupplier(supplier, true, false);
builder.setColumnFormat(new NestedCommonFormatColumn.Format(logicalType, capabilitiesBuilder.hasNulls().isTrue(), enforceLogicalType));
Expand All @@ -247,6 +249,7 @@ public void read(ByteBuffer buffer, ColumnBuilder builder, ColumnConfig columnCo
// technically, these columns are dictionary encoded, however they do not implement the DictionaryEncodedColumn
// interface, so do not make the claim in the ColumnCapabilities
builder.setType(logicalType);
builder.setHasNulls(hasNulls);
builder.setNestedCommonFormatColumnSupplier(supplier);
builder.setIndexSupplier(supplier, true, false);
builder.setColumnFormat(new NestedCommonFormatColumn.Format(logicalType, capabilitiesBuilder.hasNulls().isTrue(), enforceLogicalType));
Expand Down Expand Up @@ -275,6 +278,7 @@ public void read(ByteBuffer buffer, ColumnBuilder builder, ColumnConfig columnCo
capabilitiesBuilder.setDictionaryValuesUnique(true);
}
builder.setType(logicalType);
builder.setHasNulls(hasNulls);
builder.setNestedCommonFormatColumnSupplier(supplier);
builder.setIndexSupplier(supplier, true, false);
builder.setColumnFormat(new NestedCommonFormatColumn.Format(
Expand Down Expand Up @@ -306,6 +310,7 @@ public void read(ByteBuffer buffer, ColumnBuilder builder, ColumnConfig columnCo
ColumnType simpleType = supplier.getLogicalType();
ColumnType logicalType = simpleType == null ? ColumnType.NESTED_DATA : simpleType;
builder.setType(logicalType);
builder.setHasNulls(hasNulls);
builder.setNestedCommonFormatColumnSupplier(supplier);
// in default value mode, SQL planning by default uses selector filters for things like 'is null', which does
// not work correctly for complex types (or arrays). so, only hook up this index in sql compatible mode so that
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,30 @@ public void testGroupBySomeFieldOnLongColumnFilterNil()
);
}

@Test
public void testGroupByRootAuto()
{
GroupByQuery groupQuery = GroupByQuery.builder()
.setDataSource("test_datasource")
.setGranularity(Granularities.ALL)
.setInterval(Intervals.ETERNITY)
.setDimensions(DefaultDimensionSpec.of("dim"))
.setAggregatorSpecs(new CountAggregatorFactory("count"))
.setContext(getContext())
.build();


runResults(
groupQuery,
ImmutableList.of(
new Object[]{"100", 2L},
new Object[]{"hello", 12L},
new Object[]{"world", 2L}
)
);
}


private void runResults(
GroupByQuery groupQuery,
List<Object[]> expectedResults
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,69 +70,69 @@ public void testKeySizeEstimation()
{
AutoTypeColumnIndexer indexer = new AutoTypeColumnIndexer("test", null);
int baseCardinality = NullHandling.sqlCompatible() ? 0 : 2;
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());

EncodedKeyComponent<StructuredData> key;
// new raw value, new field, new dictionary entry
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false);
Assert.assertEquals(228, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
// adding same value only adds estimated size of value itself
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of("x", "foo"), false);
Assert.assertEquals(112, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
// new raw value, new field, new dictionary entry
key = indexer.processRowValsToUnsortedEncodedKeyComponent(10L, false);
Assert.assertEquals(94, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 2, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 2, indexer.globalDictionary.getCardinality());
// adding same value only adds estimated size of value itself
key = indexer.processRowValsToUnsortedEncodedKeyComponent(10L, false);
Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 2, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 2, indexer.globalDictionary.getCardinality());
// new raw value, new dictionary entry
key = indexer.processRowValsToUnsortedEncodedKeyComponent(11L, false);
Assert.assertEquals(48, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 3, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 3, indexer.globalDictionary.getCardinality());

// new raw value, new fields
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 10L), false);
Assert.assertEquals(168, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 6, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality());
// new raw value, re-use fields and dictionary
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 10L), false);
Assert.assertEquals(104, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 6, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality());
// new raw value, new fields
key = indexer.processRowValsToUnsortedEncodedKeyComponent(
ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)),
false
);
Assert.assertEquals(166, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 6, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality());
// new raw value
key = indexer.processRowValsToUnsortedEncodedKeyComponent(
ImmutableMap.of("x", ImmutableList.of(1L, 2L, 10L)),
false
);
Assert.assertEquals(166, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 6, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 6, indexer.globalDictionary.getCardinality());

key = indexer.processRowValsToUnsortedEncodedKeyComponent("", false);
if (NullHandling.replaceWithDefault()) {
Assert.assertEquals(0, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 7, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 7, indexer.globalDictionary.getCardinality());
} else {
Assert.assertEquals(104, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 7, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 7, indexer.globalDictionary.getCardinality());
}

key = indexer.processRowValsToUnsortedEncodedKeyComponent(0L, false);
if (NullHandling.replaceWithDefault()) {
Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 7, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 7, indexer.globalDictionary.getCardinality());
} else {
Assert.assertEquals(48, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 8, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 8, indexer.globalDictionary.getCardinality());
}
}

Expand Down Expand Up @@ -673,14 +673,14 @@ public void testConstantNull()

key = indexer.processRowValsToUnsortedEncodedKeyComponent(null, true);
Assert.assertEquals(0, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(null, true);

Assert.assertEquals(0, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(null, true);
Assert.assertEquals(0, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());


Assert.assertTrue(indexer.hasNulls);
Expand All @@ -698,14 +698,14 @@ public void testConstantString()

key = indexer.processRowValsToUnsortedEncodedKeyComponent("abcd", true);
Assert.assertEquals(166, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent("abcd", true);

Assert.assertEquals(52, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent("abcd", true);
Assert.assertEquals(52, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());

Assert.assertFalse(indexer.hasNulls);
Assert.assertFalse(indexer.hasNestedData);
Expand All @@ -722,14 +722,14 @@ public void testConstantLong()

key = indexer.processRowValsToUnsortedEncodedKeyComponent(1234L, true);
Assert.assertEquals(94, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(1234L, true);

Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(1234L, true);
Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());

Assert.assertFalse(indexer.hasNulls);
Assert.assertFalse(indexer.hasNestedData);
Expand All @@ -746,14 +746,14 @@ public void testConstantEmptyArray()

key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(), true);
Assert.assertEquals(54, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(), true);

Assert.assertEquals(8, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(), true);
Assert.assertEquals(8, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 1, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 1, indexer.globalDictionary.getCardinality());

Assert.assertFalse(indexer.hasNulls);
Assert.assertFalse(indexer.hasNestedData);
Expand All @@ -770,14 +770,14 @@ public void testConstantArray()

key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 3L), true);
Assert.assertEquals(246, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 4, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 4, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 3L), true);

Assert.assertEquals(104, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 4, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 4, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableList.of(1L, 2L, 3L), true);
Assert.assertEquals(104, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality + 4, indexer.getCardinality());
Assert.assertEquals(baseCardinality + 4, indexer.globalDictionary.getCardinality());

Assert.assertFalse(indexer.hasNulls);
Assert.assertFalse(indexer.hasNestedData);
Expand All @@ -794,14 +794,14 @@ public void testConstantEmptyObject()

key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of(), true);
Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of(), true);

Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());
key = indexer.processRowValsToUnsortedEncodedKeyComponent(ImmutableMap.of(), true);
Assert.assertEquals(16, key.getEffectiveSizeBytes());
Assert.assertEquals(baseCardinality, indexer.getCardinality());
Assert.assertEquals(baseCardinality, indexer.globalDictionary.getCardinality());

Assert.assertFalse(indexer.hasNulls);
Assert.assertTrue(indexer.hasNestedData);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.druid.query.lookup.LookupExtractorFactoryContainer;
import org.apache.druid.query.lookup.LookupExtractorFactoryContainerProvider;
import org.apache.druid.segment.FrameBasedInlineSegmentWrangler;
import org.apache.druid.segment.IncrementalIndexSegment;
import org.apache.druid.segment.InlineSegmentWrangler;
import org.apache.druid.segment.LookupSegmentWrangler;
import org.apache.druid.segment.MapSegmentWrangler;
Expand All @@ -44,6 +45,7 @@
import org.apache.druid.segment.ReferenceCountingSegment;
import org.apache.druid.segment.Segment;
import org.apache.druid.segment.SegmentWrangler;
import org.apache.druid.segment.incremental.IncrementalIndex;
import org.apache.druid.segment.join.JoinableFactory;
import org.apache.druid.segment.join.JoinableFactoryWrapper;
import org.apache.druid.server.initialization.ServerConfig;
Expand Down Expand Up @@ -196,6 +198,11 @@ public SpecificSegmentsQuerySegmentWalker add(final DataSegment descriptor, fina
return add(descriptor, new QueryableIndexSegment(index, descriptor.getId()));
}

public SpecificSegmentsQuerySegmentWalker add(final DataSegment descriptor, final IncrementalIndex index)
{
return add(descriptor, new IncrementalIndexSegment(index, descriptor.getId()));
}

public List<DataSegment> getSegments()
{
return segments;
Expand Down
Loading

0 comments on commit 4e1de50

Please sign in to comment.