From 0590d1261daf640424c035660a025197e97b4d81 Mon Sep 17 00:00:00 2001 From: "Xiaotian (Jackie) Jiang" Date: Tue, 26 May 2026 14:04:50 -0700 Subject: [PATCH] Refine index-based DISTINCT operators (JSON / inverted) - JsonIndexDistinctOperator: validate 3/4/5-arg jsonExtractIndex in the constructor (mirroring JsonExtractIndexTransformFunction.init), accept MV `_ARRAY` types, intersect per-value doc ids with the WHERE-clause filter through a unified `remainingDocs` bitmap, and surface numDocsScanned in execution statistics. - New `jsonIndexDistinctSkipMissingPath` query option: when true, the operator skips parsing the 4-arg default, skips `remainingDocs` tracking, and never throws "Illegal Json Path". - `canUseJsonIndexDistinct` simplified to a function-name check; planner routes any `jsonExtractIndex` call through the operator and lets the constructor validate arguments. - InvertedIndexDistinctOperator: cache `_totalDocs`, short-circuit the DESC sorted path with `intersects` instead of `getLongCardinality`, drop redundant `advanceIfNeeded` / inner `hasNext`, and report a correct numDocsScanned for sorted / inverted paths. - Tests rewritten as queries-based suites (`JsonIndexDistinctOperatorQueriesTest`, `InvertedIndexDistinctOperatorQueriesTest`) that drive the full broker -> operator path and assert on execution statistics; the older mock-based unit tests are removed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../utils/config/QueryOptionsUtils.java | 7 + .../query/InvertedIndexDistinctOperator.java | 289 +++--- .../query/JsonIndexDistinctOperator.java | 703 ++++++--------- .../JsonExtractIndexTransformFunction.java | 41 +- .../pinot/core/plan/DistinctPlanNode.java | 4 +- ...InvertedIndexDistinctOperatorUnitTest.java | 233 ----- .../query/JsonIndexDistinctOperatorTest.java | 306 ------- ...rtedIndexDistinctOperatorQueriesTest.java} | 327 +++---- .../JsonIndexDistinctOperatorQueriesTest.java | 447 ++++++++++ .../tests/custom/JsonPathTest.java | 825 +++++++++--------- .../pinot/spi/utils/CommonConstants.java | 5 + 11 files changed, 1443 insertions(+), 1744 deletions(-) delete mode 100644 pinot-core/src/test/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperatorUnitTest.java delete mode 100644 pinot-core/src/test/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperatorTest.java rename pinot-core/src/test/java/org/apache/pinot/queries/{InvertedIndexDistinctOperatorTest.java => InvertedIndexDistinctOperatorQueriesTest.java} (73%) create mode 100644 pinot-core/src/test/java/org/apache/pinot/queries/JsonIndexDistinctOperatorQueriesTest.java diff --git a/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java b/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java index 11c507639cf8..52829dc07f10 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java @@ -192,6 +192,13 @@ public static Double getInvertedIndexDistinctCostRatio(Map query queryOptions.get(QueryOptionKey.INVERTED_INDEX_DISTINCT_COST_RATIO)); } + /// When true, [org.apache.pinot.core.operator.query.JsonIndexDistinctOperator] skips its missing-path handling — + /// does not add a 4-arg default, does not add null, and does not throw `Illegal Json 
Path`. The distinct set is + /// purely the values returned by the JSON-index lookup. + public static boolean isJsonIndexDistinctSkipMissingPath(Map queryOptions) { + return Boolean.parseBoolean(queryOptions.get(QueryOptionKey.JSON_INDEX_DISTINCT_SKIP_MISSING_PATH)); + } + public static boolean isSkipScanFilterReorder(Map queryOptions) { return "false".equalsIgnoreCase(queryOptions.get(QueryOptionKey.USE_SCAN_REORDER_OPTIMIZATION)); } diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperator.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperator.java index c57a0e5b2425..06ab1c7cca21 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperator.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperator.java @@ -67,33 +67,30 @@ import org.apache.pinot.spi.utils.Pairs; import org.roaringbitmap.PeekableIntIterator; import org.roaringbitmap.buffer.ImmutableRoaringBitmap; -import org.roaringbitmap.buffer.MutableRoaringBitmap; -/** - * Inverted-index-based operator for single-column distinct queries on a single segment. - * - *

<p>Supports three execution paths, chosen at runtime: - * <ul>
 - * <li>Sorted index path: For sorted columns, merge-iterates filter bitmap against contiguous doc ranges. - * Cost ~ O(cardinality + filteredDocs). Always chosen when the column has a sorted forward index.</li>
 - * <li>Bitmap inverted index path: Iterates dictionary entries and uses inverted index bitmap intersections - * to check filter membership. Avoids the projection pipeline entirely. Chosen by cost heuristic when dictionary - * cardinality is much smaller than the filtered doc count.</li>
 - * <li>Scan path (fallback): Uses ProjectOperator + DistinctExecutor to scan filtered docs. - * Used when the cost heuristic determines scanning is cheaper.</li>
 - * </ul> - * - * <p>
Enabled via the {@code useIndexBasedDistinctOperator} query option. The cost ratio can be tuned - * via the {@code invertedIndexDistinctCostRatio} query option; setting it to 0 forces the inverted index path - * for non-empty filter results. - */ +/// Inverted-index-based operator for single-column distinct queries on a single segment. +/// +/// Supports three execution paths, chosen at runtime: +/// - **Sorted index path**: For sorted columns, merge-iterates filter bitmap against contiguous doc ranges. +/// Cost ~ `O(cardinality + filteredDocs)`. +/// Always chosen when the column has a sorted forward index. +/// - **Bitmap inverted index path**: Iterates dictionary entries and uses inverted index bitmap intersections to check +/// filter membership. Avoids the projection pipeline entirely. +/// Chosen by cost heuristic when dictionary cardinality is much smaller than the filtered doc count. +/// - **Scan path (fallback)**: Uses ProjectOperator + DistinctExecutor to scan filtered docs. +/// Used when the cost heuristic determines scanning is cheaper. +/// +/// Enabled via the `useIndexBasedDistinctOperator` query option. The cost ratio can be tuned via the +/// `invertedIndexDistinctCostRatio` query option; setting it to 0 forces the inverted index path for non-empty filter +/// results. 
public class InvertedIndexDistinctOperator extends BaseOperator { private static final String EXPLAIN_NAME = "DISTINCT_INVERTED_INDEX"; private static final String EXPLAIN_NAME_SORTED_INDEX = "DISTINCT_SORTED_INDEX"; private static final String EXPLAIN_NAME_SCAN_FALLBACK = "DISTINCT"; private final IndexSegment _indexSegment; + private final int _totalDocs; private final SegmentContext _segmentContext; private final QueryContext _queryContext; private final BaseFilterOperator _filterOperator; @@ -107,16 +104,15 @@ public class InvertedIndexDistinctOperator extends BaseOperator) _invertedIndexReader, filteredDocIds); + BaseFilterOperator.FilteredDocIds filteredDocIds = _filterOperator.getFilteredDocIds(); + ImmutableRoaringBitmap docIds = filteredDocIds.getDocIds(); + _numDocsScanned = docIds != null ? docIds.getCardinality() : _totalDocs; + _numEntriesScannedInFilter = filteredDocIds.getNumEntriesScannedInFilter(); + return executeSortedIndexPath((SortedIndexReader) _invertedIndexReader, docIds); } - // Prefer cheap count-only inputs for the heuristic so scan fallback can keep the original filter pipeline. 
- FilterPreparation filterPreparation = prepareBitmapPathInput(); - Integer filteredDocCount = filterPreparation.getFilteredDocCount(); - - if (filteredDocCount != null) { - if (filteredDocCount == 0) { - return createEmptyResultsBlock(); - } - // Bitmap inverted index: use cost heuristic to decide - if (shouldUseBitmapInvertedIndex(filteredDocCount)) { - ImmutableRoaringBitmap filteredDocIds = filterPreparation.getFilteredDocIds(); - if (filteredDocIds == null) { - filteredDocIds = buildFilteredDocIds(); - } - _usedInvertedIndexPath = true; - return executeInvertedIndexPath(filteredDocIds); - } + ImmutableRoaringBitmap matchingDocIds = null; + int numMatchingDocs = -1; + if (_filterOperator.isResultMatchingAll()) { + numMatchingDocs = _totalDocs; + } else if (_filterOperator.canProduceBitmaps()) { + matchingDocIds = _filterOperator.getBitmaps().reduce(); + numMatchingDocs = matchingDocIds.getCardinality(); + } + if (numMatchingDocs == 0) { + return createEmptyResultsBlock(); + } + if (numMatchingDocs > 0 && shouldUseBitmapInvertedIndex(numMatchingDocs)) { + _usedInvertedIndexPath = true; + _numDocsScanned = numMatchingDocs; + return executeInvertedIndexPath(matchingDocIds); } - return executeScanPath(filterPreparation.getFilteredDocIds()); + return executeScanPath(matchingDocIds); } // ==================== Cost Heuristic ==================== - /** - * Default cost ratios for the inverted-index-based distinct heuristic, keyed by dictionary cardinality threshold. - * The inverted index path is chosen when {@code dictionaryCardinality * costRatio <= filteredDocCount}. - * - *

<p>The cost ratio accounts for the per-entry bitmap intersection cost relative to the per-doc scan cost. - * For low-cardinality dictionaries, each bitmap is dense and {@code intersects()} is fast, but there are few - * entries so any unnecessary intersection is relatively expensive vs. scanning a small filtered doc set. - * For high-cardinality dictionaries, bitmaps are sparser and {@code intersects()} is slower per entry, - * but the scan path also becomes cheaper (fewer docs per value), so a lower ratio suffices. - * - *
<p>Benchmarking (BenchmarkInvertedIndexDistinct, 1M docs) shows the crossover points: - * <ul>
 - * <li>dictCard ≤ 1K: costRatio=30 — inverted index wins when filteredDocs ≥ ~30x dictCard</li>
 - * <li>dictCard ≤ 10K: costRatio=10 — inverted index wins when filteredDocs ≥ ~10x dictCard</li>
 - * <li>dictCard > 10K: costRatio=6 — inverted index wins when filteredDocs ≥ ~6x dictCard</li>
 - * </ul> - * - * <p>
Can be overridden at query time via the query option {@code invertedIndexDistinctCostRatio}. Setting it - * to 0 forces the inverted index path for non-empty filter results. - */ - static final NavigableMap DEFAULT_COST_RATIO_BY_CARDINALITY; + /// Default cost ratios for the inverted-index-based distinct heuristic, keyed by dictionary cardinality threshold. + /// The inverted index path is chosen when `dictionaryCardinality * costRatio <= filteredDocCount`. + /// + /// The cost ratio accounts for the per-entry bitmap intersection cost relative to the per-doc scan cost. + /// For low-cardinality dictionaries, each bitmap is dense and `intersects()` is fast, but there are few entries so + /// any unnecessary intersection is relatively expensive vs. scanning a small filtered doc set. + /// For high-cardinality dictionaries, bitmaps are sparser and `intersects()` is slower per entry, but the scan path + /// also becomes cheaper (fewer docs per value), so a lower ratio suffices. + /// + /// Benchmarking (BenchmarkInvertedIndexDistinct, 1M docs) shows the crossover points: + /// - dictCard ≤ 1K: costRatio=30 — inverted index wins when filteredDocs ≥ ~30x dictCard + /// - dictCard ≤ 10K: costRatio=10 — inverted index wins when filteredDocs ≥ ~10x dictCard + /// - dictCard > 10K: costRatio=6 — inverted index wins when filteredDocs ≥ ~6x dictCard + /// + /// Can be overridden at query time via the query option `invertedIndexDistinctCostRatio`. Setting it to 0 forces the + /// inverted index path for non-empty filter results. 
+ private static final NavigableMap DEFAULT_COST_RATIO_BY_CARDINALITY; static { TreeMap map = new TreeMap<>(); @@ -192,9 +189,6 @@ static double getDefaultCostRatio(int dictionaryCardinality) { } private boolean shouldUseBitmapInvertedIndex(int filteredDocCount) { - if (filteredDocCount == 0) { - return false; - } Double costRatioOverride = QueryOptionsUtils.getInvertedIndexDistinctCostRatio(_queryContext.getQueryOptions()); if (costRatioOverride != null && costRatioOverride == 0.0) { return true; @@ -204,59 +198,15 @@ private boolean shouldUseBitmapInvertedIndex(int filteredDocCount) { return (double) dictionaryCardinality * costRatio <= filteredDocCount; } - static final class FilterPreparation { - @Nullable - private final ImmutableRoaringBitmap _filteredDocIds; - @Nullable - private final Integer _filteredDocCount; - - private FilterPreparation(@Nullable ImmutableRoaringBitmap filteredDocIds, @Nullable Integer filteredDocCount) { - _filteredDocIds = filteredDocIds; - _filteredDocCount = filteredDocCount; - } - - @Nullable - ImmutableRoaringBitmap getFilteredDocIds() { - return _filteredDocIds; - } - - @Nullable - Integer getFilteredDocCount() { - return _filteredDocCount; - } - } - - FilterPreparation prepareBitmapPathInput() { - int totalDocs = _indexSegment.getSegmentMetadata().getTotalDocs(); - if (_filterOperator.isResultMatchingAll()) { - return new FilterPreparation(null, totalDocs); - } - if (_filterOperator.isResultEmpty()) { - return new FilterPreparation(new MutableRoaringBitmap(), 0); - } - // Prefer the cheaper exact count when available so scan fallback does not pay eager bitmap materialization. 
- if (_filterOperator.canOptimizeCount()) { - return new FilterPreparation(null, _filterOperator.getNumMatchingDocs()); - } - if (_filterOperator.canProduceBitmaps()) { - ImmutableRoaringBitmap filteredDocIds = _filterOperator.getBitmaps().reduce(); - return new FilterPreparation(filteredDocIds, filteredDocIds.getCardinality()); - } - return new FilterPreparation(null, null); - } - // ==================== Scan Path (Fallback) ==================== - /** - * Scan fallback: uses ProjectOperator + DistinctExecutor. When an exact filter bitmap is already cheaply available, - * wraps it in a {@link BitmapBasedFilterOperator} to avoid re-evaluating the filter through the projection pipeline. - * Otherwise preserves the original filter operator so scan fallback does not pay eager bitmap materialization. - */ + /// Scan fallback: uses ProjectOperator + DistinctExecutor. When an exact filter bitmap is already cheaply available, + /// wraps it in a [BitmapBasedFilterOperator] to avoid re-evaluating the filter through the projection pipeline. + /// Otherwise, preserves the original filter operator so scan fallback does not pay eager bitmap materialization. private DistinctResultsBlock executeScanPath(@Nullable ImmutableRoaringBitmap filteredDocIds) { BaseFilterOperator filterOp; if (filteredDocIds != null) { - filterOp = new BitmapBasedFilterOperator(filteredDocIds, false, - _indexSegment.getSegmentMetadata().getTotalDocs()); + filterOp = new BitmapBasedFilterOperator(filteredDocIds, false, _totalDocs); } else { filterOp = _filterOperator; } @@ -275,10 +225,8 @@ private DistinctResultsBlock executeScanPath(@Nullable ImmutableRoaringBitmap fi // ==================== Sorted Index Path ==================== - /** - * Optimized path for sorted columns. Each dictId maps to a contiguous doc range [start, end]. - * We merge-iterate the filter bitmap with the sorted ranges in O(cardinality + filteredDocs). - */ + /// Optimized path for sorted columns. 
Each dictId maps to a contiguous doc range [start,end]. + /// We merge-iterate the filter bitmap with the sorted ranges in O(cardinality + filteredDocs). private DistinctResultsBlock executeSortedIndexPath(SortedIndexReader sortedReader, @Nullable ImmutableRoaringBitmap filteredDocIds) { OrderByExpressionContext orderByExpression = @@ -300,13 +248,12 @@ private DistinctResultsBlock executeSortedIndexPath(SortedIndexReader sortedR if (nonNullFilteredDocIds == null) { // No filter, no null exclusion — every dictionary value is present - int entriesExamined = 0; int start = iterateReverse ? dictLength - 1 : 0; int end = iterateReverse ? -1 : dictLength; int step = iterateReverse ? -1 : 1; for (int dictId = start; dictId != end; dictId += step) { - QueryThreadContext.checkTerminationAndSampleUsagePeriodically(entriesExamined, EXPLAIN_NAME_SORTED_INDEX); - entriesExamined++; + QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExaminedPostFilter++, + EXPLAIN_NAME_SORTED_INDEX); if (dictId == nullResult._nullPlaceholderDictId) { continue; } @@ -315,50 +262,40 @@ private DistinctResultsBlock executeSortedIndexPath(SortedIndexReader sortedR break; } } - _numEntriesExamined = entriesExamined; } else if (!nonNullFilteredDocIds.isEmpty()) { if (iterateReverse) { - // DESC + LIMIT: iterate dictIds backward, use rangeCardinality for presence check. - // Each dictId maps to a contiguous doc range, so rangeCardinality is O(1) per check. - int entriesExamined = 0; + // DESC + LIMIT: iterate dictIds backward, use intersects for presence check. Each dictId maps to a contiguous + // doc range; intersects short-circuits on the first matching container. 
for (int dictId = dictLength - 1; dictId >= 0; dictId--) { - QueryThreadContext.checkTerminationAndSampleUsagePeriodically(entriesExamined, EXPLAIN_NAME_SORTED_INDEX); - entriesExamined++; + QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExaminedPostFilter++, + EXPLAIN_NAME_SORTED_INDEX); Pairs.IntPair range = sortedReader.getDocIds(dictId); int startDocId = range.getLeft(); int endDocId = range.getRight(); // inclusive - if (nonNullFilteredDocIds.rangeCardinality(startDocId, endDocId + 1L) > 0) { + if (nonNullFilteredDocIds.intersects(startDocId, endDocId + 1L)) { if (addDistinctValue(distinctTable, dictId, orderByExpression, true)) { break; } } } - _numEntriesExamined = entriesExamined; } else { - // ASC or no ORDER BY: merge-iterate forward (O(cardinality + filteredDocs)) + // ASC or no ORDER BY: merge-iterate forward (O(cardinality + filteredDocs)). Sorted-index ranges are + // contiguous over [0, numDocs), and we advance the filter iterator past `endDocId` after each match, so + // `peekNext()` is always >= the current dictId's `startDocId` at the top of each iteration. 
PeekableIntIterator filterIter = nonNullFilteredDocIds.getIntIterator(); - int dictId; - for (dictId = 0; dictId < dictLength && filterIter.hasNext(); dictId++) { - QueryThreadContext.checkTerminationAndSampleUsagePeriodically(dictId, EXPLAIN_NAME_SORTED_INDEX); - Pairs.IntPair range = sortedReader.getDocIds(dictId); - int startDocId = range.getLeft(); - int endDocId = range.getRight(); // inclusive - - // Skip filter docs before this range - filterIter.advanceIfNeeded(startDocId); - - // Check if any non-null filter doc falls within this range - if (filterIter.hasNext() && filterIter.peekNext() <= endDocId) { + for (int dictId = 0; dictId < dictLength && filterIter.hasNext(); dictId++) { + QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExaminedPostFilter++, + EXPLAIN_NAME_SORTED_INDEX); + int endDocId = sortedReader.getDocIds(dictId).getRight(); // inclusive + if (filterIter.peekNext() <= endDocId) { boolean done = addDistinctValue(distinctTable, dictId, orderByExpression, orderedEarlyTermination); if (done) { - _numEntriesExamined = dictId + 1; return new DistinctResultsBlock(convertDistinctTable(distinctTable, nullResult._hasNull), _queryContext); } - // Advance past the current range for next dictId + // Advance past the current range so the next iteration's peekNext() is >= the next dictId's startDocId. filterIter.advanceIfNeeded(endDocId + 1); } } - _numEntriesExamined = dictId; } } @@ -385,14 +322,11 @@ private DistinctResultsBlock executeInvertedIndexPath(@Nullable ImmutableRoaring boolean orderedEarlyTermination = useDictIdTable && orderByExpression != null && distinctTable.hasLimit(); boolean iterateReverse = orderedEarlyTermination && !orderByExpression.isAsc(); - int entriesExamined = 0; int start = iterateReverse ? dictLength - 1 : 0; int end = iterateReverse ? -1 : dictLength; int step = iterateReverse ? 
-1 : 1; - for (int dictId = start; dictId != end; dictId += step) { - QueryThreadContext.checkTerminationAndSampleUsagePeriodically(entriesExamined, EXPLAIN_NAME); - entriesExamined++; + QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExaminedPostFilter++, EXPLAIN_NAME); if (dictId == nullResult._nullPlaceholderDictId) { continue; } @@ -419,18 +353,10 @@ private DistinctResultsBlock executeInvertedIndexPath(@Nullable ImmutableRoaring } } } - _numEntriesExamined = entriesExamined; return new DistinctResultsBlock(convertDistinctTable(distinctTable, nullResult._hasNull), _queryContext); } - @Nullable - private ImmutableRoaringBitmap buildFilteredDocIds() { - BaseFilterOperator.FilteredDocIds filteredDocIds = _filterOperator.getFilteredDocIds(); - _numEntriesScannedInFilter = filteredDocIds.getNumEntriesScannedInFilter(); - return filteredDocIds.getDocIds(); - } - private boolean canUseDictIdDistinctTable(@Nullable OrderByExpressionContext orderByExpression) { return orderByExpression == null || _dictionary.isSorted(); } @@ -605,14 +531,12 @@ private boolean addDistinctValue(DistinctTable distinctTable, int dictId, // ==================== Null Handling ==================== - /** - * Processes null handling for the filter bitmap. Returns the filter bitmap with null docs excluded - * and whether any filtered docs have null values. - * - *

Nulls are not in the dictionary, so they must be checked separately via the null value vector. - * The null placeholder value (e.g., Integer.MIN_VALUE) is excluded from dictionary iteration by - * removing null docs from the filter bitmap. - */ + /// Processes null handling for the filter bitmap. Returns the filter bitmap with null docs excluded and whether any + /// filtered docs have null values. + /// + /// Nulls are not in the dictionary, so they must be checked separately via the null value vector. + /// The null placeholder value (e.g., `Integer.MIN_VALUE`) is excluded from dictionary iteration by removing null docs + /// from the filter bitmap. private NullFilterResult processNullDocs(@Nullable ImmutableRoaringBitmap filteredDocIds) { if (!_queryContext.isNullHandlingEnabled()) { return new NullFilterResult(filteredDocIds, false, Dictionary.NULL_VALUE_INDEX); @@ -633,6 +557,7 @@ private NullFilterResult processNullDocs(@Nullable ImmutableRoaringBitmap filter if (filteredDocIds == null) { // Preserve match-all to avoid materializing a dense complement bitmap. Instead skip the null placeholder dictId // while iterating dictionary values. + // TODO: This will count all default null values as null, regardless of whether they are actually nulls. nonNullFilteredDocIds = null; nullPlaceholderDictId = getNullPlaceholderDictId(); } else { @@ -680,13 +605,7 @@ private static class NullFilterResult { @Override public List getChildOperators() { - // If scan fallback was used, the project operator is the logical child - if (_projectOperator != null && !_usedInvertedIndexPath) { - return Collections.singletonList(_projectOperator); - } - // For inverted/sorted index paths (or before execution in EXPLAIN plans), - // the filter operator is the logical child. - return Collections.singletonList(_filterOperator); + return _projectOperator != null ? 
List.of(_projectOperator) : List.of(_filterOperator); } @Override @@ -696,21 +615,17 @@ public IndexSegment getIndexSegment() { @Override public ExecutionStatistics getExecutionStatistics() { - int numTotalDocs = _indexSegment.getSegmentMetadata().getTotalDocs(); - if (_usedInvertedIndexPath || _projectOperator == null) { - // For inverted/sorted index paths: numDocsScanned=0 (no forward index lookups), - // numEntriesScannedInFilter tracks work done while materializing the exact filter bitmap, - // numEntriesScannedPostFilter=numEntriesExamined (dictionary entries examined via bitmap - // intersection or sorted range checks). - return new ExecutionStatistics(0, _numEntriesScannedInFilter, _numEntriesExamined, numTotalDocs); + if (_projectOperator == null) { + // For inverted/sorted index paths: + // - numDocsScanned tracks the matching docs + // - numEntriesScannedInFilter tracks work done while materializing the exact filter bitmap + // - numEntriesScannedPostFilter tracks dictionary entries examined + return new ExecutionStatistics(_numDocsScanned, _numEntriesScannedInFilter, _numEntriesExaminedPostFilter, + _totalDocs); } - // _numEntriesScannedInFilter captures filter work from exact bitmap materialization (non-zero only when - // the filter could not produce bitmaps directly). The project operator's stats capture any additional - // filter work (zero when using a pre-built BitmapBasedFilterOperator). 
- long numEntriesScannedInFilter = _numEntriesScannedInFilter - + _projectOperator.getExecutionStatistics().getNumEntriesScannedInFilter(); // Single-column distinct, so numEntriesScannedPostFilter equals numDocsScanned - return new ExecutionStatistics(_numDocsScanned, numEntriesScannedInFilter, _numDocsScanned, numTotalDocs); + return new ExecutionStatistics(_numDocsScanned, + _projectOperator.getExecutionStatistics().getNumEntriesScannedInFilter(), _numDocsScanned, _totalDocs); } private String resolveExplainName() { diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperator.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperator.java index 0970cac305e3..a94dcd6253f3 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperator.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperator.java @@ -18,23 +18,20 @@ */ package org.apache.pinot.core.operator.query; +import com.fasterxml.jackson.databind.JsonNode; import com.google.common.base.CaseFormat; +import java.io.IOException; import java.math.BigDecimal; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import javax.annotation.Nullable; import org.apache.pinot.common.function.JsonPathCache; import org.apache.pinot.common.request.context.ExpressionContext; -import org.apache.pinot.common.request.context.FilterContext; import org.apache.pinot.common.request.context.OrderByExpressionContext; -import org.apache.pinot.common.request.context.RequestContextUtils; -import org.apache.pinot.common.request.context.predicate.JsonMatchPredicate; -import org.apache.pinot.common.request.context.predicate.Predicate; import org.apache.pinot.common.utils.DataSchema; import org.apache.pinot.common.utils.DataSchema.ColumnDataType; +import org.apache.pinot.common.utils.config.QueryOptionsUtils; import 
org.apache.pinot.core.common.Operator; import org.apache.pinot.core.operator.BaseOperator; import org.apache.pinot.core.operator.ExecutionStatistics; @@ -50,516 +47,396 @@ import org.apache.pinot.core.query.distinct.table.StringDistinctTable; import org.apache.pinot.core.query.request.context.QueryContext; import org.apache.pinot.segment.spi.IndexSegment; -import org.apache.pinot.segment.spi.SegmentContext; import org.apache.pinot.segment.spi.datasource.DataSource; import org.apache.pinot.segment.spi.index.IndexService; import org.apache.pinot.segment.spi.index.IndexType; import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; -import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.FieldSpec.DataType; import org.apache.pinot.spi.query.QueryThreadContext; -import org.apache.pinot.sql.parsers.CalciteSqlParser; +import org.apache.pinot.spi.utils.JsonUtils; import org.roaringbitmap.RoaringBitmap; import org.roaringbitmap.buffer.ImmutableRoaringBitmap; -/** - * Distinct operator for the scalar {@code jsonExtractIndex(column, path, type[, defaultValue])} form. - * - *

Execution flow: - * 1. Push a same-path {@code JSON_MATCH} predicate into the JSON-index lookup when it cannot match missing paths. - * 2. Convert matching flattened doc ids back to segment doc ids. - * 3. Apply any remaining row-level filter and materialize DISTINCT results, including missing-path handling. - */ +/// Distinct operator for `jsonExtractIndex(column, path, type[, defaultValue[, filterJsonExpression]])`. +/// +/// Supports both SV (e.g. `STRING`) and MV (e.g. `STRING_ARRAY`) result types — DISTINCT collapses MV array elements +/// to scalar rows, matching the scan-based `SELECT DISTINCT mvCol` convention. The 4-arg default is a single value +/// for SV; for MV it's a JSON array whose elements are each added to the distinct set when no doc matches the path. +/// +/// Execution flow: +/// 1. Pass the optional 5-arg `filterJsonExpression` directly to the JSON-index lookup (matches +/// `JsonExtractIndexTransformFunction`'s convention). +/// 2. Convert matching flattened doc ids back to segment doc ids. +/// 3. Apply any remaining row-level WHERE filter and materialize DISTINCT results, including missing-path handling. public class JsonIndexDistinctOperator extends BaseOperator { private static final String EXPLAIN_NAME = "DISTINCT_JSON_INDEX"; private static final String FUNCTION_NAME = "jsonExtractIndex"; + /// Returns true if the expression is a `jsonExtractIndex` function call. All other validation (argument count/types, + /// column existence, JSON index presence, path support) happens inside the operator's constructor and matches what + /// the scan-based fallback (`JsonExtractIndexTransformFunction`) would surface during its own `init`. 
+ public static boolean canUseJsonIndexDistinct(ExpressionContext expr) { + return expr.getType() == ExpressionContext.Type.FUNCTION && FUNCTION_NAME.equalsIgnoreCase( + expr.getFunction().getFunctionName()); + } + private final IndexSegment _indexSegment; - private final SegmentContext _segmentContext; + private final int _totalDocs; private final QueryContext _queryContext; private final BaseFilterOperator _filterOperator; + private final ExpressionContext _expression; + private final boolean _skipMissingPath; + private final JsonIndexReader _jsonIndexReader; + private final String _jsonPathString; + private final DataType _dataType; + @Nullable + private final String[] _defaultValueLiterals; + @Nullable + private final String _filterJsonExpression; + private final DataSchema _dataSchema; + @Nullable + private final OrderByExpressionContext _orderByExpression; - private int _numEntriesExamined = 0; + private int _numDocsScanned = 0; private long _numEntriesScannedInFilter = 0; + private int _numEntriesExaminedPostFilter = 0; - public JsonIndexDistinctOperator(IndexSegment indexSegment, SegmentContext segmentContext, - QueryContext queryContext, BaseFilterOperator filterOperator) { + public JsonIndexDistinctOperator(IndexSegment indexSegment, QueryContext queryContext, + BaseFilterOperator filterOperator) { _indexSegment = indexSegment; - _segmentContext = segmentContext; + _totalDocs = indexSegment.getSegmentMetadata().getTotalDocs(); _queryContext = queryContext; _filterOperator = filterOperator; - } - - @Override - protected DistinctResultsBlock getNextBlock() { - List expressions = _queryContext.getSelectExpressions(); + List expressions = queryContext.getSelectExpressions(); if (expressions.size() != 1) { throw new IllegalStateException("JsonIndexDistinctOperator supports single expression only"); } + _expression = expressions.get(0); + _skipMissingPath = QueryOptionsUtils.isJsonIndexDistinctSkipMissingPath(queryContext.getQueryOptions()); - ExpressionContext 
expr = expressions.get(0); - ParsedJsonExtractIndex parsed = parseJsonExtractIndex(expr); - if (parsed == null) { - throw new IllegalStateException("Expected 3/4-arg scalar jsonExtractIndex expression"); - } + // Mirrors the arguments handling logic in `JsonExtractIndexTransformFunction` - DataSource dataSource = _indexSegment.getDataSource(parsed._columnName, _queryContext.getSchema()); - JsonIndexReader jsonIndexReader = getJsonIndexReader(dataSource); - if (jsonIndexReader == null) { - throw new IllegalStateException("Column " + parsed._columnName + " has no JSON index"); + List arguments = _expression.getFunction().getArguments(); + int numArguments = arguments.size(); + // Check that there are exactly 3 or 4 or 5 arguments + if (numArguments < 3 || numArguments > 5) { + throw new IllegalArgumentException( + "Expected 3/4/5 arguments for jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," + + " ['defaultValue'], ['jsonFilterExpression'])"); } - String pushedDownFilterJson = extractSamePathJsonMatchFilter(parsed, _queryContext.getFilter()); - boolean filterFullyPushedDown = pushedDownFilterJson != null - && isOnlySamePathJsonMatchFilter(parsed, _queryContext.getFilter()) - && !jsonMatchFilterCanMatchMissingPath(pushedDownFilterJson); - - // Fast path: when the filter is fully pushed down into the JSON index, we only need the distinct value strings. - // This avoids reading posting lists, building per-value bitmaps, and converting flattened doc IDs. 
- if (filterFullyPushedDown) { - Set distinctValues = jsonIndexReader.getMatchingDistinctValues( - parsed._jsonPathString, pushedDownFilterJson); - return buildDistinctResultsFromValues(expr, parsed, distinctValues); + ExpressionContext firstArgument = arguments.get(0); + if (firstArgument.getType() == ExpressionContext.Type.IDENTIFIER) { + DataSource dataSource = indexSegment.getDataSource(firstArgument.getIdentifier()); + _jsonIndexReader = getJsonIndexReader(dataSource); + if (_jsonIndexReader == null) { + throw new IllegalStateException("jsonExtractIndex can only be applied on a column with JSON index"); + } + } else { + throw new IllegalArgumentException("jsonExtractIndex can only be applied to a raw column"); } - // Evaluate the filter first so we can skip the (potentially expensive) index map when no docs match. - RoaringBitmap filteredDocIds = buildFilteredDocIds(); - if (filteredDocIds != null && filteredDocIds.isEmpty()) { - ColumnDataType earlyColumnDataType = ColumnDataType.fromDataTypeSV(parsed._dataType); - DataSchema earlyDataSchema = new DataSchema( - new String[]{expr.toString()}, - new ColumnDataType[]{earlyColumnDataType}); - OrderByExpressionContext earlyOrderBy = _queryContext.getOrderByExpressions() != null - ? _queryContext.getOrderByExpressions().get(0) : null; - return new DistinctResultsBlock( - createDistinctTable(earlyDataSchema, parsed._dataType, earlyOrderBy), _queryContext); + ExpressionContext secondArgument = arguments.get(1); + if (secondArgument.getType() != ExpressionContext.Type.LITERAL) { + throw new IllegalArgumentException("JSON path argument must be a literal"); + } + _jsonPathString = secondArgument.getLiteral().getStringValue(); + try { + JsonPathCache.INSTANCE.getOrCompute(_jsonPathString); + } catch (Exception e) { + throw new IllegalArgumentException("JSON path argument is not a valid JSON path"); } - // All other WHERE filters remain row-level and are applied after converting flattened doc IDs to real doc IDs. 
- Map valueToMatchingDocs = - jsonIndexReader.getMatchingFlattenedDocsMap(parsed._jsonPathString, pushedDownFilterJson); + ExpressionContext thirdArgument = arguments.get(2); + if (thirdArgument.getType() != ExpressionContext.Type.LITERAL) { + throw new IllegalArgumentException("Result type argument must be a literal"); + } + String resultsType = thirdArgument.getLiteral().getStringValue().toUpperCase(); + boolean isSingleValue = !resultsType.endsWith("_ARRAY"); + if (isSingleValue && _jsonPathString.contains("[*]")) { + throw new IllegalArgumentException( + "[*] syntax in json path is unsupported for singleValue field json_extract_index"); + } + String dataTypeName = isSingleValue ? resultsType : resultsType.substring(0, resultsType.length() - 6); + try { + _dataType = DataType.valueOf(dataTypeName); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Unknown jsonExtractIndex result type: " + resultsType); + } + switch (_dataType) { + case INT: + case LONG: + case FLOAT: + case DOUBLE: + case BIG_DECIMAL: + case STRING: + break; + default: + throw new IllegalArgumentException("Unsupported jsonExtractIndex result type for distinct: " + _dataType); + } - // Always single-value (MV _ARRAY is rejected in parseJsonExtractIndex) - jsonIndexReader.convertFlattenedDocIdsToDocIds(valueToMatchingDocs); - return buildDistinctResultsBlock(expr, parsed, valueToMatchingDocs, filteredDocIds, - filteredDocIds == null); - } + // With _skipMissingPath, the 4-arg default is never used at runtime (handleMissingDocs is bypassed), so don't + // parse or validate it — accept any literal shape and ignore it. 
+ if (numArguments >= 4 && !_skipMissingPath) { + ExpressionContext fourthArgument = arguments.get(3); + if (fourthArgument.getType() != ExpressionContext.Type.LITERAL) { + throw new IllegalArgumentException("Default value must be a literal"); + } + String defaultLiteral = fourthArgument.getLiteral().getStringValue(); + if (isSingleValue) { + try { + _dataType.convert(defaultLiteral); + } catch (Exception e) { + throw new IllegalArgumentException("Default value '" + defaultLiteral + "' is not a valid " + _dataType); + } + _defaultValueLiterals = new String[]{defaultLiteral}; + } else { + try { + JsonNode mvArray = JsonUtils.stringToJsonNode(defaultLiteral); + if (!mvArray.isArray()) { + throw new IllegalArgumentException("Default value must be a valid JSON array"); + } + String[] literals = new String[mvArray.size()]; + for (int i = 0; i < mvArray.size(); i++) { + literals[i] = mvArray.get(i).asText(); + try { + _dataType.convert(literals[i]); + } catch (Exception e) { + throw new IllegalArgumentException("Default value '" + literals[i] + "' is not a valid " + _dataType); + } + } + _defaultValueLiterals = literals; + } catch (IOException e) { + throw new IllegalArgumentException("Default value must be a valid JSON array"); + } + } + } else { + _defaultValueLiterals = null; + } - private DistinctResultsBlock buildDistinctResultsFromValues(ExpressionContext expr, ParsedJsonExtractIndex parsed, - Set distinctValues) { - ColumnDataType columnDataType = ColumnDataType.fromDataTypeSV(parsed._dataType); - DataSchema dataSchema = new DataSchema( - new String[]{expr.toString()}, - new ColumnDataType[]{columnDataType}); - OrderByExpressionContext orderByExpression = _queryContext.getOrderByExpressions() != null - ? 
_queryContext.getOrderByExpressions().get(0) : null; - DistinctTable distinctTable = createDistinctTable(dataSchema, parsed._dataType, orderByExpression); - int limit = _queryContext.getLimit(); + if (numArguments == 5) { + ExpressionContext fifthArgument = arguments.get(4); + if (fifthArgument.getType() != ExpressionContext.Type.LITERAL) { + throw new IllegalArgumentException("JSON path filter argument must be a literal"); + } + _filterJsonExpression = fifthArgument.getLiteral().getStringValue(); + } else { + _filterJsonExpression = null; + } - for (String value : distinctValues) { - _numEntriesExamined++; - QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExamined, EXPLAIN_NAME); + _dataSchema = new DataSchema(new String[]{_expression.toString()}, + new ColumnDataType[]{ColumnDataType.fromDataTypeSV(_dataType)}); + List orderByExpressions = queryContext.getOrderByExpressions(); + _orderByExpression = orderByExpressions != null ? orderByExpressions.get(0) : null; + } - boolean done = addValueToDistinctTable(distinctTable, value, parsed._dataType, orderByExpression); - if (done) { - break; - } - if (orderByExpression == null && distinctTable.hasLimit() && distinctTable.size() >= limit) { - break; + @Nullable + private static JsonIndexReader getJsonIndexReader(DataSource dataSource) { + JsonIndexReader reader = dataSource.getJsonIndex(); + // TODO: rework + if (reader == null) { + Optional> compositeIndex = IndexService.getInstance().getOptional("composite_json_index"); + if (compositeIndex.isPresent()) { + reader = (JsonIndexReader) dataSource.getIndex(compositeIndex.get()); } } + return reader; + } - return new DistinctResultsBlock(distinctTable, _queryContext); + @Override + protected DistinctResultsBlock getNextBlock() { + // Evaluate the filter first so we can skip the (potentially expensive) index map when no docs match. 
+ BaseFilterOperator.FilteredDocIds filteredDocIds = _filterOperator.getFilteredDocIds(); + ImmutableRoaringBitmap docIds = filteredDocIds.getDocIds(); + _numDocsScanned = docIds != null ? docIds.getCardinality() : _totalDocs; + _numEntriesScannedInFilter = filteredDocIds.getNumEntriesScannedInFilter(); + if (_numDocsScanned == 0) { + return new DistinctResultsBlock(createDistinctTable(), _queryContext); + } + + // The 5-arg form's filter literal is pushed into the JSON index; WHERE-clause filters remain row-level and are + // applied after converting flattened doc IDs to real doc IDs. + Map valueToMatchingDocs = + _jsonIndexReader.getMatchingFlattenedDocsMap(_jsonPathString, _filterJsonExpression); + _jsonIndexReader.convertFlattenedDocIdsToDocIds(valueToMatchingDocs); + return buildDistinctResultsBlock(valueToMatchingDocs, docIds != null ? docIds.toRoaringBitmap() : null); } - private DistinctResultsBlock buildDistinctResultsBlock(ExpressionContext expr, ParsedJsonExtractIndex parsed, - Map valueToMatchingDocs, @Nullable RoaringBitmap filteredDocIds, - boolean allDocsSelected) { - ColumnDataType columnDataType = ColumnDataType.fromDataTypeSV(parsed._dataType); - DataSchema dataSchema = new DataSchema( - new String[]{expr.toString()}, - new ColumnDataType[]{columnDataType}); - OrderByExpressionContext orderByExpression = _queryContext.getOrderByExpressions() != null - ? _queryContext.getOrderByExpressions().get(0) : null; - DistinctTable distinctTable = createDistinctTable(dataSchema, parsed._dataType, orderByExpression); + private DistinctResultsBlock buildDistinctResultsBlock(Map valueToMatchingDocs, + @Nullable RoaringBitmap filteredDocIds) { + DistinctTable distinctTable = createDistinctTable(); - int limit = _queryContext.getLimit(); - int totalDocs = _indexSegment.getSegmentMetadata().getTotalDocs(); - RoaringBitmap coveredDocs = allDocsSelected ? new RoaringBitmap() : null; - RoaringBitmap remainingDocs = filteredDocIds != null ? 
filteredDocIds.clone() : null; - boolean allDocsCovered = filteredDocIds == null ? !allDocsSelected || totalDocs == 0 : filteredDocIds.isEmpty(); + // With _skipMissingPath, handleMissingDocs is bypassed — no need to track which docs are still uncovered, so + // skip the bitmap allocation and per-iteration `andNot` work entirely. + boolean allDocsCovered = _skipMissingPath; + RoaringBitmap remainingDocs = _skipMissingPath ? null + : (filteredDocIds != null ? filteredDocIds.clone() : RoaringBitmap.bitmapOfRange(0L, _totalDocs)); boolean earlyBreak = false; for (Map.Entry entry : valueToMatchingDocs.entrySet()) { - _numEntriesExamined++; - QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExamined, EXPLAIN_NAME); + QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExaminedPostFilter++, EXPLAIN_NAME); String value = entry.getKey(); RoaringBitmap docIds = entry.getValue(); - boolean includeValue; - if (filteredDocIds == null) { - includeValue = true; - if (!allDocsCovered && allDocsSelected) { - coveredDocs.or(docIds); - if (coveredDocs.getLongCardinality() >= totalDocs) { - allDocsCovered = true; - } - } - } else { - includeValue = RoaringBitmap.intersects(docIds, filteredDocIds); - // Remove matched docs from remaining set in-place (no allocation per value). - if (!allDocsCovered && includeValue) { - remainingDocs.andNot(docIds); - if (remainingDocs.isEmpty()) { - allDocsCovered = true; - } - } - } + // Unfiltered always includes; filtered must intersect the original filter set (not the shrinking + // `remainingDocs`, since a value can still belong to the result after all filtered docs are covered). 
+ boolean includeValue = filteredDocIds == null || RoaringBitmap.intersects(docIds, filteredDocIds); - if (includeValue) { - boolean done = addValueToDistinctTable(distinctTable, value, parsed._dataType, orderByExpression); - if (done) { - earlyBreak = true; - break; + if (!allDocsCovered && includeValue) { + remainingDocs.andNot(docIds); + if (remainingDocs.isEmpty()) { + allDocsCovered = true; } } - if (orderByExpression == null && distinctTable.hasLimit() && distinctTable.size() >= limit) { + // addValueToDistinctTable returns true exactly when the table has reached its LIMIT (no-ORDER-BY case); + // for ORDER-BY or unbounded LIMIT it always returns false. So no separate size check is needed. + if (includeValue && addValueToDistinctTable(distinctTable, value)) { earlyBreak = true; break; } } if (!earlyBreak && !allDocsCovered) { - handleMissingDocs(distinctTable, parsed, orderByExpression); + handleMissingDocs(distinctTable); } return new DistinctResultsBlock(distinctTable, _queryContext); } - private void handleMissingDocs(DistinctTable distinctTable, ParsedJsonExtractIndex parsed, - @Nullable OrderByExpressionContext orderByExpression) { - if (parsed._defaultValueLiteral != null) { - addValueToDistinctTable(distinctTable, parsed._defaultValueLiteral, parsed._dataType, orderByExpression); - } else if (_queryContext.isNullHandlingEnabled()) { - distinctTable.addNull(); - } else { - throw new RuntimeException( - String.format("Illegal Json Path: [%s], for some docIds in segment [%s]", - parsed._jsonPathString, _indexSegment.getSegmentName())); - } - } - - @Nullable - private static String extractSamePathJsonMatchFilter(ParsedJsonExtractIndex parsed, @Nullable FilterContext filter) { - if (filter == null) { - return null; - } - switch (filter.getType()) { - case PREDICATE: - return extractSamePathJsonMatchFilter(parsed, filter.getPredicate()); - case AND: - String matchingFilter = null; - for (FilterContext child : filter.getChildren()) { - String childFilter = 
extractSamePathJsonMatchFilter(parsed, child); - if (childFilter == null) { - continue; - } - if (matchingFilter != null) { - return null; - } - matchingFilter = childFilter; - } - return matchingFilter; - default: - return null; - } - } - - private static boolean isOnlySamePathJsonMatchFilter(ParsedJsonExtractIndex parsed, @Nullable FilterContext filter) { - if (filter == null || filter.getType() != FilterContext.Type.PREDICATE) { - return false; - } - return extractSamePathJsonMatchFilter(parsed, filter.getPredicate()) != null; - } - - private static boolean jsonMatchFilterCanMatchMissingPath(String filterJsonString) { - try { - FilterContext filter = RequestContextUtils.getFilter(CalciteSqlParser.compileToExpression(filterJsonString)); - return filter.getType() == FilterContext.Type.PREDICATE - && filter.getPredicate().getType() == Predicate.Type.IS_NULL; - } catch (Exception e) { - return false; - } - } - - @Nullable - private static String extractSamePathJsonMatchFilter(ParsedJsonExtractIndex parsed, Predicate predicate) { - if (!(predicate instanceof JsonMatchPredicate)) { - return null; - } - ExpressionContext lhs = predicate.getLhs(); - if (lhs.getType() != ExpressionContext.Type.IDENTIFIER - || !parsed._columnName.equals(lhs.getIdentifier())) { - return null; - } - String filterJsonString = ((JsonMatchPredicate) predicate).getValue(); - int start = filterJsonString.indexOf('"'); - if (start < 0) { - return null; - } - int end = filterJsonString.indexOf('"', start + 1); - if (end < 0) { - return null; - } - String filterPath = filterJsonString.substring(start + 1, end); - return parsed._jsonPathString.equals(filterPath) ? 
filterJsonString : null; - } - - private DistinctTable createDistinctTable(DataSchema dataSchema, FieldSpec.DataType dataType, - @Nullable OrderByExpressionContext orderByExpression) { + private DistinctTable createDistinctTable() { int limit = _queryContext.getLimit(); boolean nullHandlingEnabled = _queryContext.isNullHandlingEnabled(); - switch (dataType) { + switch (_dataType) { case INT: - return new IntDistinctTable(dataSchema, limit, nullHandlingEnabled, orderByExpression); + return new IntDistinctTable(_dataSchema, limit, nullHandlingEnabled, _orderByExpression); case LONG: - return new LongDistinctTable(dataSchema, limit, nullHandlingEnabled, orderByExpression); + return new LongDistinctTable(_dataSchema, limit, nullHandlingEnabled, _orderByExpression); case FLOAT: - return new FloatDistinctTable(dataSchema, limit, nullHandlingEnabled, orderByExpression); + return new FloatDistinctTable(_dataSchema, limit, nullHandlingEnabled, _orderByExpression); case DOUBLE: - return new DoubleDistinctTable(dataSchema, limit, nullHandlingEnabled, orderByExpression); + return new DoubleDistinctTable(_dataSchema, limit, nullHandlingEnabled, _orderByExpression); case BIG_DECIMAL: - return new BigDecimalDistinctTable(dataSchema, limit, nullHandlingEnabled, orderByExpression); + return new BigDecimalDistinctTable(_dataSchema, limit, nullHandlingEnabled, _orderByExpression); case STRING: - return new StringDistinctTable(dataSchema, limit, nullHandlingEnabled, orderByExpression); + return new StringDistinctTable(_dataSchema, limit, nullHandlingEnabled, _orderByExpression); default: - throw new IllegalStateException("Unsupported data type for JSON index distinct: " + dataType); + throw new IllegalStateException("Unsupported data type for JSON index distinct: " + _dataType); } } - private static boolean addValueToDistinctTable(DistinctTable distinctTable, String stringValue, - FieldSpec.DataType dataType, @Nullable OrderByExpressionContext orderByExpression) { - switch (dataType) 
{ + private boolean addValueToDistinctTable(DistinctTable distinctTable, String stringValue) { + switch (_dataType) { case INT: - return addToTable((IntDistinctTable) distinctTable, Integer.parseInt(stringValue), orderByExpression); + return addToTable((IntDistinctTable) distinctTable, Integer.parseInt(stringValue)); case LONG: - return addToTable((LongDistinctTable) distinctTable, Long.parseLong(stringValue), orderByExpression); + return addToTable((LongDistinctTable) distinctTable, Long.parseLong(stringValue)); case FLOAT: - return addToTable((FloatDistinctTable) distinctTable, Float.parseFloat(stringValue), orderByExpression); + return addToTable((FloatDistinctTable) distinctTable, Float.parseFloat(stringValue)); case DOUBLE: - return addToTable((DoubleDistinctTable) distinctTable, Double.parseDouble(stringValue), orderByExpression); + return addToTable((DoubleDistinctTable) distinctTable, Double.parseDouble(stringValue)); case BIG_DECIMAL: - return addToTable((BigDecimalDistinctTable) distinctTable, new BigDecimal(stringValue), orderByExpression); + return addToTable((BigDecimalDistinctTable) distinctTable, new BigDecimal(stringValue)); case STRING: - return addToTable((StringDistinctTable) distinctTable, stringValue, orderByExpression); + return addToTable((StringDistinctTable) distinctTable, stringValue); default: - throw new IllegalStateException("Unsupported data type for JSON index distinct: " + dataType); + throw new IllegalStateException("Unsupported data type for JSON index distinct: " + _dataType); } } - private static boolean addToTable(IntDistinctTable table, int value, - @Nullable OrderByExpressionContext orderByExpression) { - if (table.hasLimit()) { - if (orderByExpression != null) { - table.addWithOrderBy(value); - return false; - } else { - return table.addWithoutOrderBy(value); - } - } else { + private boolean addToTable(IntDistinctTable table, int value) { + if (!table.hasLimit()) { table.addUnbounded(value); return false; } - } - - private 
static boolean addToTable(LongDistinctTable table, long value, - @Nullable OrderByExpressionContext orderByExpression) { - if (table.hasLimit()) { - if (orderByExpression != null) { - table.addWithOrderBy(value); - return false; - } else { - return table.addWithoutOrderBy(value); - } - } else { - table.addUnbounded(value); + if (_orderByExpression != null) { + table.addWithOrderBy(value); return false; } + return table.addWithoutOrderBy(value); } - private static boolean addToTable(FloatDistinctTable table, float value, - @Nullable OrderByExpressionContext orderByExpression) { - if (table.hasLimit()) { - if (orderByExpression != null) { - table.addWithOrderBy(value); - return false; - } else { - return table.addWithoutOrderBy(value); - } - } else { + private boolean addToTable(LongDistinctTable table, long value) { + if (!table.hasLimit()) { table.addUnbounded(value); return false; } - } - - private static boolean addToTable(DoubleDistinctTable table, double value, - @Nullable OrderByExpressionContext orderByExpression) { - if (table.hasLimit()) { - if (orderByExpression != null) { - table.addWithOrderBy(value); - return false; - } else { - return table.addWithoutOrderBy(value); - } - } else { - table.addUnbounded(value); + if (_orderByExpression != null) { + table.addWithOrderBy(value); return false; } + return table.addWithoutOrderBy(value); } - private static boolean addToTable(BigDecimalDistinctTable table, BigDecimal value, - @Nullable OrderByExpressionContext orderByExpression) { - if (table.hasLimit()) { - if (orderByExpression != null) { - table.addWithOrderBy(value); - return false; - } else { - return table.addWithoutOrderBy(value); - } - } else { + private boolean addToTable(FloatDistinctTable table, float value) { + if (!table.hasLimit()) { table.addUnbounded(value); return false; } + if (_orderByExpression != null) { + table.addWithOrderBy(value); + return false; + } + return table.addWithoutOrderBy(value); } - private static boolean 
addToTable(StringDistinctTable table, String value, - @Nullable OrderByExpressionContext orderByExpression) { - if (table.hasLimit()) { - if (orderByExpression != null) { - table.addWithOrderBy(value); - return false; - } else { - return table.addWithoutOrderBy(value); - } - } else { + private boolean addToTable(DoubleDistinctTable table, double value) { + if (!table.hasLimit()) { table.addUnbounded(value); return false; } - } - - @Nullable - private static JsonIndexReader getJsonIndexReader(DataSource dataSource) { - JsonIndexReader reader = dataSource.getJsonIndex(); - if (reader == null) { - Optional> compositeIndex = - IndexService.getInstance().getOptional("composite_json_index"); - if (compositeIndex.isPresent()) { - reader = (JsonIndexReader) dataSource.getIndex(compositeIndex.get()); - } + if (_orderByExpression != null) { + table.addWithOrderBy(value); + return false; } - return reader; - } - - @Nullable - private RoaringBitmap buildFilteredDocIds() { - BaseFilterOperator.FilteredDocIds filteredDocIds = _filterOperator.getFilteredDocIds(); - _numEntriesScannedInFilter = filteredDocIds.getNumEntriesScannedInFilter(); - ImmutableRoaringBitmap docIds = filteredDocIds.getDocIds(); - return docIds != null ? 
docIds.toRoaringBitmap() : null; + return table.addWithoutOrderBy(value); } - @Nullable - private static ParsedJsonExtractIndex parseJsonExtractIndex(ExpressionContext expr) { - if (expr.getType() != ExpressionContext.Type.FUNCTION) { - return null; - } - if (!FUNCTION_NAME.equalsIgnoreCase(expr.getFunction().getFunctionName())) { - return null; - } - List args = expr.getFunction().getArguments(); - if (args.size() != 3 && args.size() != 4) { - return null; - } - if (args.get(0).getType() != ExpressionContext.Type.IDENTIFIER) { - return null; - } - if (args.get(1).getType() != ExpressionContext.Type.LITERAL - || args.get(2).getType() != ExpressionContext.Type.LITERAL - || (args.size() == 4 && args.get(3).getType() != ExpressionContext.Type.LITERAL)) { - return null; - } - - String columnName = args.get(0).getIdentifier(); - String jsonPathString = args.get(1).getLiteral().getStringValue(); - String resultsType = args.get(2).getLiteral().getStringValue().toUpperCase(); - // Only single-value types are supported; MV (_ARRAY) would have incorrect flattened-to-real - // docId intersection since convertFlattenedDocIdsToDocIds is skipped for MV. 
- if (resultsType.endsWith("_ARRAY")) { - return null; - } - if (jsonPathString.contains("[*]")) { - return null; - } - - FieldSpec.DataType dataType; - try { - dataType = FieldSpec.DataType.valueOf(resultsType); - } catch (IllegalArgumentException e) { - return null; + private boolean addToTable(BigDecimalDistinctTable table, BigDecimal value) { + if (!table.hasLimit()) { + table.addUnbounded(value); + return false; } - // Only types with a corresponding DistinctTable implementation are supported - switch (dataType) { - case INT: - case LONG: - case FLOAT: - case DOUBLE: - case BIG_DECIMAL: - case STRING: - break; - default: - return null; + if (_orderByExpression != null) { + table.addWithOrderBy(value); + return false; } + return table.addWithoutOrderBy(value); + } - try { - JsonPathCache.INSTANCE.getOrCompute(jsonPathString); - } catch (Exception e) { - return null; + private boolean addToTable(StringDistinctTable table, String value) { + if (!table.hasLimit()) { + table.addUnbounded(value); + return false; } - - String defaultValueLiteral = null; - if (args.size() == 4) { - defaultValueLiteral = args.get(3).getLiteral().getStringValue(); - try { - dataType.convert(defaultValueLiteral); - } catch (Exception e) { - return null; - } + if (_orderByExpression != null) { + table.addWithOrderBy(value); + return false; } - - return new ParsedJsonExtractIndex(columnName, jsonPathString, dataType, defaultValueLiteral); + return table.addWithoutOrderBy(value); } - private static final class ParsedJsonExtractIndex { - final String _columnName; - final String _jsonPathString; - final FieldSpec.DataType _dataType; - @Nullable - final String _defaultValueLiteral; - - ParsedJsonExtractIndex(String columnName, String jsonPathString, FieldSpec.DataType dataType, - @Nullable String defaultValueLiteral) { - _columnName = columnName; - _jsonPathString = jsonPathString; - _dataType = dataType; - _defaultValueLiteral = defaultValueLiteral; + private void 
handleMissingDocs(DistinctTable distinctTable) { + if (_defaultValueLiterals != null) { + for (String literal : _defaultValueLiterals) { + if (addValueToDistinctTable(distinctTable, literal)) { + return; + } + } + } else if (_queryContext.isNullHandlingEnabled()) { + distinctTable.addNull(); + } else { + throw new RuntimeException( + String.format("Illegal Json Path: [%s], for some docIds in segment [%s]", _jsonPathString, + _indexSegment.getSegmentName())); } } @Override public List getChildOperators() { - return Collections.singletonList(_filterOperator); + return List.of(_filterOperator); } @Override @@ -569,16 +446,16 @@ public IndexSegment getIndexSegment() { @Override public ExecutionStatistics getExecutionStatistics() { - int numTotalDocs = _indexSegment.getSegmentMetadata().getTotalDocs(); - // Index-only operator: no docs scanned, no entries scanned post-filter. - // Filter-phase stats are tracked when buildFilteredDocIds falls back to DocIdSetPlanNode. - return new ExecutionStatistics(0, _numEntriesScannedInFilter, 0, numTotalDocs); + // - numDocsScanned tracks the matching docs + // - numEntriesScannedInFilter tracks work done while materializing the exact filter bitmap + // - numEntriesScannedPostFilter tracks values examined + return new ExecutionStatistics(_numDocsScanned, _numEntriesScannedInFilter, _numEntriesExaminedPostFilter, + _totalDocs); } @Override public String toExplainString() { - List expressions = _queryContext.getSelectExpressions(); - return EXPLAIN_NAME + "(keyColumns:" + (expressions.isEmpty() ? 
"" : expressions.get(0).toString()) + ")"; + return EXPLAIN_NAME + "(keyColumns:" + _expression + ")"; } @Override @@ -589,34 +466,6 @@ protected String getExplainName() { @Override protected void explainAttributes(ExplainAttributeBuilder attributeBuilder) { super.explainAttributes(attributeBuilder); - List selectExpressions = _queryContext.getSelectExpressions(); - if (!selectExpressions.isEmpty()) { - attributeBuilder.putStringList("keyColumns", - List.of(selectExpressions.get(0).toString())); - } - } - - /** - * Returns true if the expression is the 3/4-arg scalar jsonExtractIndex form on a column with JSON index and the - * path is indexed. For OSS JSON index all paths are indexed. For composite JSON index, only paths in - * invertedIndexConfigs are indexed per key. - */ - public static boolean canUseJsonIndexDistinct(IndexSegment indexSegment, ExpressionContext expr) { - ParsedJsonExtractIndex parsed = parseJsonExtractIndex(expr); - if (parsed == null) { - return false; - } - DataSource dataSource = indexSegment.getDataSourceNullable(parsed._columnName); - if (dataSource == null) { - return false; - } - JsonIndexReader reader = getJsonIndexReader(dataSource); - if (reader == null) { - return false; - } - if (!reader.isPathIndexed(parsed._jsonPathString)) { - return false; - } - return true; + attributeBuilder.putStringList("keyColumns", List.of(_expression.toString())); } } diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java index 97b34e05e200..cb3a53d1062f 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java @@ -24,10 +24,12 @@ import java.util.List; import java.util.Map; import 
java.util.Optional; +import javax.annotation.Nullable; import org.apache.pinot.common.function.JsonPathCache; import org.apache.pinot.core.operator.ColumnContext; import org.apache.pinot.core.operator.blocks.ValueBlock; import org.apache.pinot.core.operator.transform.TransformResultMetadata; +import org.apache.pinot.segment.spi.datasource.DataSource; import org.apache.pinot.segment.spi.index.IndexService; import org.apache.pinot.segment.spi.index.IndexType; import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; @@ -45,14 +47,15 @@ public class JsonExtractIndexTransformFunction extends BaseTransformFunction { public static final String FUNCTION_NAME = "jsonExtractIndex"; - private TransformFunction _jsonFieldTransformFunction; - private String _jsonPathString; - private TransformResultMetadata _resultMetadata; private JsonIndexReader _jsonIndexReader; + private String _jsonPathString; + private boolean _isSingleValue; + @Nullable private Object _defaultValue; + @Nullable + private String _filterJsonExpression; + private TransformResultMetadata _resultMetadata; private Map _valueToMatchingDocsMap; - private boolean _isSingleValue; - private String _filterJsonPath; @Override public String getName() { @@ -62,8 +65,10 @@ public String getName() { @Override public void init(List arguments, Map columnContextMap) { super.init(arguments, columnContextMap); + + int numArguments = arguments.size(); // Check that there are exactly 3 or 4 or 5 arguments - if (arguments.size() < 3 || arguments.size() > 5) { + if (numArguments < 3 || numArguments > 5) { throw new IllegalArgumentException( "Expected 3/4/5 arguments for transform function: jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," + " ['defaultValue'], ['jsonFilterExpression'])"); @@ -71,14 +76,14 @@ public void init(List arguments, Map c TransformFunction firstArgument = arguments.get(0); if (firstArgument instanceof IdentifierTransformFunction) { - String columnName = ((IdentifierTransformFunction) 
firstArgument).getColumnName(); - _jsonIndexReader = columnContextMap.get(columnName).getDataSource().getJsonIndex(); - if (_jsonIndexReader == null) { //TODO: rework - Optional> compositeIndex = - IndexService.getInstance().getOptional("composite_json_index"); + DataSource dataSource = + columnContextMap.get(((IdentifierTransformFunction) firstArgument).getColumnName()).getDataSource(); + _jsonIndexReader = dataSource.getJsonIndex(); + // TODO: rework + if (_jsonIndexReader == null) { + Optional> compositeIndex = IndexService.getInstance().getOptional("composite_json_index"); if (compositeIndex.isPresent()) { - _jsonIndexReader = (JsonIndexReader) columnContextMap.get(columnName) - .getDataSource().getIndex(compositeIndex.get()); + _jsonIndexReader = (JsonIndexReader) dataSource.getIndex(compositeIndex.get()); } } if (_jsonIndexReader == null) { @@ -87,7 +92,6 @@ public void init(List arguments, Map c } else { throw new IllegalArgumentException("jsonExtractIndex can only be applied to a raw column"); } - _jsonFieldTransformFunction = firstArgument; TransformFunction secondArgument = arguments.get(1); if (!(secondArgument instanceof LiteralTransformFunction)) { @@ -113,12 +117,11 @@ public void init(List arguments, Map c DataType dataType = _isSingleValue ? 
DataType.valueOf(resultsType) : DataType.valueOf(resultsType.substring(0, resultsType.length() - 6)); - if (arguments.size() >= 4) { + if (numArguments >= 4) { TransformFunction fourthArgument = arguments.get(3); if (!(fourthArgument instanceof LiteralTransformFunction)) { throw new IllegalArgumentException("Default value must be a literal"); } - if (_isSingleValue) { _defaultValue = dataType.convert(((LiteralTransformFunction) fourthArgument).getStringLiteral()); } else { @@ -138,12 +141,12 @@ public void init(List arguments, Map c } } - if (arguments.size() == 5) { + if (numArguments == 5) { TransformFunction fifthArgument = arguments.get(4); if (!(fifthArgument instanceof LiteralTransformFunction)) { throw new IllegalArgumentException("JSON path filter argument must be a literal"); } - _filterJsonPath = ((LiteralTransformFunction) fifthArgument).getStringLiteral(); + _filterJsonExpression = ((LiteralTransformFunction) fifthArgument).getStringLiteral(); } _resultMetadata = new TransformResultMetadata(dataType, _isSingleValue, false); @@ -425,7 +428,7 @@ public String[][] transformToStringValuesMV(ValueBlock valueBlock) { */ private Map getValueToMatchingDocsMap() { if (_valueToMatchingDocsMap == null) { - _valueToMatchingDocsMap = _jsonIndexReader.getMatchingFlattenedDocsMap(_jsonPathString, _filterJsonPath); + _valueToMatchingDocsMap = _jsonIndexReader.getMatchingFlattenedDocsMap(_jsonPathString, _filterJsonExpression); if (_isSingleValue) { // For single value result type, it's more efficient to use original docIDs map _jsonIndexReader.convertFlattenedDocIdsToDocIds(_valueToMatchingDocsMap); diff --git a/pinot-core/src/main/java/org/apache/pinot/core/plan/DistinctPlanNode.java b/pinot-core/src/main/java/org/apache/pinot/core/plan/DistinctPlanNode.java index 7b35faa6ea78..54edba21a8fb 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/plan/DistinctPlanNode.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/plan/DistinctPlanNode.java @@ -84,9 
+84,9 @@ public Operator run() { ExpressionContext expr = expressions.get(0); // JSON index path - if (JsonIndexDistinctOperator.canUseJsonIndexDistinct(_indexSegment, expr)) { + if (JsonIndexDistinctOperator.canUseJsonIndexDistinct(expr)) { BaseFilterOperator filterOperator = new FilterPlanNode(_segmentContext, _queryContext).run(); - return new JsonIndexDistinctOperator(_indexSegment, _segmentContext, _queryContext, filterOperator); + return new JsonIndexDistinctOperator(_indexSegment, _queryContext, filterOperator); } // Inverted/sorted index path. For unsorted dictionaries the operator still avoids the scan/projection path, diff --git a/pinot-core/src/test/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperatorUnitTest.java b/pinot-core/src/test/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperatorUnitTest.java deleted file mode 100644 index 48408cdcc9de..000000000000 --- a/pinot-core/src/test/java/org/apache/pinot/core/operator/query/InvertedIndexDistinctOperatorUnitTest.java +++ /dev/null @@ -1,233 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.core.operator.query; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import org.apache.pinot.common.request.context.ExpressionContext; -import org.apache.pinot.core.common.BlockDocIdSet; -import org.apache.pinot.core.common.Operator; -import org.apache.pinot.core.operator.BaseProjectOperator; -import org.apache.pinot.core.operator.ColumnContext; -import org.apache.pinot.core.operator.DocIdOrderedOperator.DocIdOrder; -import org.apache.pinot.core.operator.ProjectionOperator; -import org.apache.pinot.core.operator.ProjectionOperatorUtils; -import org.apache.pinot.core.operator.blocks.ProjectionBlock; -import org.apache.pinot.core.operator.blocks.results.DistinctResultsBlock; -import org.apache.pinot.core.operator.docidsets.MatchAllDocIdSet; -import org.apache.pinot.core.operator.filter.BaseFilterOperator; -import org.apache.pinot.core.operator.filter.BitmapCollection; -import org.apache.pinot.core.query.request.context.QueryContext; -import org.apache.pinot.core.query.request.context.utils.QueryContextConverterUtils; -import org.apache.pinot.segment.spi.IndexSegment; -import org.apache.pinot.segment.spi.SegmentContext; -import org.apache.pinot.segment.spi.SegmentMetadata; -import org.apache.pinot.segment.spi.datasource.DataSource; -import org.apache.pinot.segment.spi.datasource.DataSourceMetadata; -import org.apache.pinot.segment.spi.index.reader.Dictionary; -import org.apache.pinot.segment.spi.index.reader.InvertedIndexReader; -import org.apache.pinot.spi.data.FieldSpec.DataType; -import org.testng.annotations.Test; - -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.verifyNoInteractions; -import static org.mockito.Mockito.when; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertNotNull; - - -/** - * Unit tests for {@link 
InvertedIndexDistinctOperator}. - */ -public class InvertedIndexDistinctOperatorUnitTest { - - @Test - public void testScanFallbackDoesNotMaterializeBitmapWhenCountIsAvailable() { - QueryContext queryContext = QueryContextConverterUtils.getQueryContext( - "SELECT DISTINCT testColumn FROM testTable " - + "OPTION(useIndexBasedDistinctOperator=true, invertedIndexDistinctCostRatio=1000)"); - - Dictionary dictionary = mock(Dictionary.class); - when(dictionary.length()).thenReturn(100); - when(dictionary.getValueType()).thenReturn(DataType.INT); - - DataSourceMetadata dataSourceMetadata = mock(DataSourceMetadata.class); - when(dataSourceMetadata.getDataType()).thenReturn(DataType.INT); - when(dataSourceMetadata.isSingleValue()).thenReturn(true); - - DataSource dataSource = mock(DataSource.class); - when(dataSource.getDictionary()).thenReturn(dictionary); - when(dataSource.getDataSourceMetadata()).thenReturn(dataSourceMetadata); - @SuppressWarnings("rawtypes") - InvertedIndexReader invertedIndexReader = mock(InvertedIndexReader.class); - when(dataSource.getInvertedIndex()).thenReturn(invertedIndexReader); - - SegmentMetadata segmentMetadata = mock(SegmentMetadata.class); - when(segmentMetadata.getTotalDocs()).thenReturn(10); - - IndexSegment indexSegment = mock(IndexSegment.class); - when(indexSegment.getSegmentMetadata()).thenReturn(segmentMetadata); - when(indexSegment.getDataSource(eq("testColumn"), any())).thenReturn(dataSource); - - ColumnContext columnContext = ColumnContext.fromDataSource(dataSource); - ProjectionOperatorUtils.setImplementation((dataSourceMap, docIdSetOperator, ignoredQueryContext) -> - new EmptyProjectionOperator(ignoredQueryContext, "testColumn", columnContext)); - try { - DistinctResultsBlock resultsBlock = - new InvertedIndexDistinctOperator(indexSegment, new SegmentContext(indexSegment), queryContext, - new CountOptimizedBitmapCapableFilterOperator(10, 5), dataSource).nextBlock(); - - assertNotNull(resultsBlock); - 
verifyNoInteractions(invertedIndexReader); - } finally { - ProjectionOperatorUtils.setImplementation(new ProjectionOperatorUtils.DefaultImplementation()); - } - } - - @Test - public void testEmptyCountOptimizedFilterShortCircuitsWithoutProjection() { - QueryContext queryContext = QueryContextConverterUtils.getQueryContext( - "SELECT DISTINCT testColumn FROM testTable " - + "OPTION(useIndexBasedDistinctOperator=true)"); - - Dictionary dictionary = mock(Dictionary.class); - when(dictionary.length()).thenReturn(100); - when(dictionary.getValueType()).thenReturn(DataType.INT); - - DataSourceMetadata dataSourceMetadata = mock(DataSourceMetadata.class); - when(dataSourceMetadata.getDataType()).thenReturn(DataType.INT); - when(dataSourceMetadata.isSingleValue()).thenReturn(true); - - DataSource dataSource = mock(DataSource.class); - when(dataSource.getDictionary()).thenReturn(dictionary); - when(dataSource.getDataSourceMetadata()).thenReturn(dataSourceMetadata); - @SuppressWarnings("rawtypes") - InvertedIndexReader invertedIndexReader = mock(InvertedIndexReader.class); - when(dataSource.getInvertedIndex()).thenReturn(invertedIndexReader); - - SegmentMetadata segmentMetadata = mock(SegmentMetadata.class); - when(segmentMetadata.getTotalDocs()).thenReturn(10); - - IndexSegment indexSegment = mock(IndexSegment.class); - when(indexSegment.getSegmentMetadata()).thenReturn(segmentMetadata); - when(indexSegment.getDataSource(eq("testColumn"), any())).thenReturn(dataSource); - - ProjectionOperatorUtils.setImplementation((dataSourceMap, docIdSetOperator, ignoredQueryContext) -> { - throw new AssertionError("Empty result should short-circuit before building projection"); - }); - try { - DistinctResultsBlock resultsBlock = - new InvertedIndexDistinctOperator(indexSegment, new SegmentContext(indexSegment), queryContext, - new CountOptimizedFilterOperator(10, 0), dataSource).nextBlock(); - - assertNotNull(resultsBlock); - assertEquals(resultsBlock.getNumRows(), 0); - 
verifyNoInteractions(invertedIndexReader); - } finally { - ProjectionOperatorUtils.setImplementation(new ProjectionOperatorUtils.DefaultImplementation()); - } - } - - private static class CountOptimizedFilterOperator extends BaseFilterOperator { - private final int _numMatchingDocs; - - private CountOptimizedFilterOperator(int numDocs, int numMatchingDocs) { - super(numDocs, false); - _numMatchingDocs = numMatchingDocs; - } - - @Override - public boolean canOptimizeCount() { - return true; - } - - @Override - public int getNumMatchingDocs() { - return _numMatchingDocs; - } - - @Override - public FilteredDocIds getFilteredDocIds() { - throw new AssertionError("Scan fallback should not materialize filtered doc ids for count-optimized filters"); - } - - @Override - protected BlockDocIdSet getTrues() { - return new MatchAllDocIdSet(_numDocs); - } - - @Override - public String toExplainString() { - return "COUNT_OPTIMIZED_TEST_FILTER"; - } - - @Override - public List getChildOperators() { - return Collections.emptyList(); - } - } - - private static final class CountOptimizedBitmapCapableFilterOperator extends CountOptimizedFilterOperator { - private CountOptimizedBitmapCapableFilterOperator(int numDocs, int numMatchingDocs) { - super(numDocs, numMatchingDocs); - } - - @Override - public boolean canProduceBitmaps() { - return true; - } - - @Override - public BitmapCollection getBitmaps() { - throw new AssertionError("Count-optimized filters should not eagerly materialize bitmaps"); - } - } - - private static final class EmptyProjectionOperator extends ProjectionOperator { - private final Map _columnContextMap; - - private EmptyProjectionOperator(QueryContext queryContext, String column, ColumnContext columnContext) { - super(Collections.emptyMap(), null, queryContext); - _columnContextMap = Map.of(column, columnContext); - } - - @Override - public Map getSourceColumnContextMap() { - return _columnContextMap; - } - - @Override - public ColumnContext 
getResultColumnContext(ExpressionContext expression) { - return _columnContextMap.get(expression.getIdentifier()); - } - - @Override - protected ProjectionBlock getNextBlock() { - return null; - } - - @Override - public BaseProjectOperator withOrder(DocIdOrder newOrder) { - return this; - } - } -} diff --git a/pinot-core/src/test/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperatorTest.java b/pinot-core/src/test/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperatorTest.java deleted file mode 100644 index bb8431d0707a..000000000000 --- a/pinot-core/src/test/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperatorTest.java +++ /dev/null @@ -1,306 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.pinot.core.operator.query; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.pinot.core.common.Operator; -import org.apache.pinot.core.operator.blocks.results.DistinctResultsBlock; -import org.apache.pinot.core.operator.filter.BaseFilterOperator; -import org.apache.pinot.core.operator.filter.BitmapCollection; -import org.apache.pinot.core.query.request.context.QueryContext; -import org.apache.pinot.core.query.request.context.utils.QueryContextConverterUtils; -import org.apache.pinot.segment.spi.IndexSegment; -import org.apache.pinot.segment.spi.SegmentContext; -import org.apache.pinot.segment.spi.SegmentMetadata; -import org.apache.pinot.segment.spi.datasource.DataSource; -import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; -import org.roaringbitmap.RoaringBitmap; -import org.roaringbitmap.buffer.MutableRoaringBitmap; -import org.testng.annotations.Test; - -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; -import static org.testng.Assert.expectThrows; - - -/** - * Unit tests for {@link JsonIndexDistinctOperator}. 
- */ -public class JsonIndexDistinctOperatorTest { - private static final String STRING_EXTRACT = "JSON_EXTRACT_INDEX(tags, '$.instance', 'STRING')"; - private static final String STRING_EXTRACT_WITH_EMPTY_DEFAULT = - "JSON_EXTRACT_INDEX(tags, '$.instance', 'STRING', '')"; - private static final String STRING_EXTRACT_WITH_DEFAULT = - "JSON_EXTRACT_INDEX(tags, '$.instance', 'STRING', 'missing')"; - private static final String INVALID_INT_DEFAULT_EXTRACT = - "JSON_EXTRACT_INDEX(tags, '$.instance', 'INT', 'abc')"; - private static final String SAME_PATH_FILTER = "REGEXP_LIKE(\"$.instance\", '.*test.*')"; - private static final String CROSS_PATH_FILTER = "REGEXP_LIKE(\"$.env\", 'prod.*')"; - private static final String SAME_PATH_IS_NULL_FILTER = "\"$.instance\" IS NULL"; - - @Test - public void testSamePathJsonMatchUsesDistinctValuesFastPathForFourArgScalarForm() { - QueryContext queryContext = distinctQuery(STRING_EXTRACT_WITH_EMPTY_DEFAULT, SAME_PATH_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - when(jsonIndexReader.getMatchingDistinctValues("$.instance", SAME_PATH_FILTER)) - .thenReturn(Set.of("test-east", "test-west")); - - DistinctResultsBlock resultsBlock = - buildOperator(queryContext, jsonIndexReader, bufferBitmap(0, 1), 2).nextBlock(); - - assertEquals(extractValues(resultsBlock), Set.of("test-east", "test-west")); - verify(jsonIndexReader).getMatchingDistinctValues("$.instance", SAME_PATH_FILTER); - verify(jsonIndexReader, never()).getMatchingFlattenedDocsMap(any(), any()); - verify(jsonIndexReader, never()).convertFlattenedDocIdsToDocIds(any()); - } - - @Test - public void testSamePathJsonMatchUsesDistinctValuesFastPathForThreeArgScalarForm() { - QueryContext queryContext = distinctQuery(STRING_EXTRACT, SAME_PATH_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - when(jsonIndexReader.getMatchingDistinctValues("$.instance", SAME_PATH_FILTER)) - .thenReturn(Set.of("test-east", "test-west")); - - 
DistinctResultsBlock resultsBlock = - buildOperator(queryContext, jsonIndexReader, bufferBitmap(0, 1), 2).nextBlock(); - - assertEquals(extractValues(resultsBlock), Set.of("test-east", "test-west")); - verify(jsonIndexReader).getMatchingDistinctValues("$.instance", SAME_PATH_FILTER); - verify(jsonIndexReader, never()).getMatchingFlattenedDocsMap(any(), any()); - verify(jsonIndexReader, never()).convertFlattenedDocIdsToDocIds(any()); - } - - @Test - public void testDifferentPathJsonMatchIsAppliedAtDocLevel() { - QueryContext queryContext = distinctQuery(STRING_EXTRACT, CROSS_PATH_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - Map flattenedDocsByValue = new HashMap<>(); - flattenedDocsByValue.put("prod-a", bitmap(100)); - flattenedDocsByValue.put("prod-b", bitmap(200)); - flattenedDocsByValue.put("other-doc", bitmap(300)); - when(jsonIndexReader.getMatchingFlattenedDocsMap("$.instance", null)).thenReturn(flattenedDocsByValue); - stubConvertedDocIds(jsonIndexReader, Map.of("prod-a", bitmap(0), "prod-b", bitmap(1), "other-doc", bitmap(2))); - - DistinctResultsBlock resultsBlock = - buildOperator(queryContext, jsonIndexReader, bufferBitmap(0, 1), 3).nextBlock(); - - assertEquals(extractValues(resultsBlock), Set.of("prod-a", "prod-b")); - verify(jsonIndexReader).getMatchingFlattenedDocsMap("$.instance", null); - verify(jsonIndexReader, never()).getMatchingFlattenedDocsMap("$.instance", - "REGEXP_LIKE(\"$.env\", ''prod.*'')"); - verify(jsonIndexReader).convertFlattenedDocIdsToDocIds(any()); - } - - @Test - public void testCanUseJsonIndexDistinctAllowsThreeArgScalarForm() { - QueryContext queryContext = distinctQuery(STRING_EXTRACT, CROSS_PATH_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - when(jsonIndexReader.isPathIndexed("$.instance")).thenReturn(true); - IndexSegment indexSegment = buildCanUseIndexSegment(jsonIndexReader); - - assertTrue(JsonIndexDistinctOperator.canUseJsonIndexDistinct(indexSegment, - 
queryContext.getSelectExpressions().get(0))); - } - - @Test - public void testCanUseJsonIndexDistinctAllowsFourArgScalarForm() { - QueryContext queryContext = distinctQuery(STRING_EXTRACT_WITH_EMPTY_DEFAULT, CROSS_PATH_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - when(jsonIndexReader.isPathIndexed("$.instance")).thenReturn(true); - IndexSegment indexSegment = buildCanUseIndexSegment(jsonIndexReader); - - assertTrue(JsonIndexDistinctOperator.canUseJsonIndexDistinct(indexSegment, - queryContext.getSelectExpressions().get(0))); - } - - @Test - public void testCanUseJsonIndexDistinctRejectsInvalidDefaultArgument() { - QueryContext queryContext = distinctQuery(INVALID_INT_DEFAULT_EXTRACT, CROSS_PATH_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - when(jsonIndexReader.isPathIndexed("$.instance")).thenReturn(true); - IndexSegment indexSegment = buildCanUseIndexSegment(jsonIndexReader); - - assertFalse(JsonIndexDistinctOperator.canUseJsonIndexDistinct(indexSegment, - queryContext.getSelectExpressions().get(0))); - } - - @Test - public void testFourArgAddsDefaultForDocsWithoutJsonPath() { - QueryContext queryContext = distinctQuery(STRING_EXTRACT_WITH_DEFAULT, CROSS_PATH_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - Map flattenedDocsByValue = new HashMap<>(); - flattenedDocsByValue.put("prod-a", bitmap(100)); - flattenedDocsByValue.put("prod-b", bitmap(200)); - when(jsonIndexReader.getMatchingFlattenedDocsMap("$.instance", null)).thenReturn(flattenedDocsByValue); - stubConvertedDocIds(jsonIndexReader, Map.of("prod-a", bitmap(0), "prod-b", bitmap(1))); - - DistinctResultsBlock resultsBlock = - buildOperator(queryContext, jsonIndexReader, bufferBitmap(0, 1, 2), 3).nextBlock(); - - assertEquals(extractValues(resultsBlock), Set.of("prod-a", "prod-b", "missing")); - } - - @Test - public void testSamePathIsNullStillAddsDefaultForMissingPath() { - QueryContext queryContext = 
distinctQuery(STRING_EXTRACT_WITH_DEFAULT, SAME_PATH_IS_NULL_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - when(jsonIndexReader.getMatchingFlattenedDocsMap("$.instance", SAME_PATH_IS_NULL_FILTER)).thenReturn( - new HashMap<>()); - - DistinctResultsBlock resultsBlock = - buildOperator(queryContext, jsonIndexReader, bufferBitmap(2), 3).nextBlock(); - - assertEquals(extractValues(resultsBlock), Set.of("missing")); - verify(jsonIndexReader).getMatchingFlattenedDocsMap("$.instance", SAME_PATH_IS_NULL_FILTER); - verify(jsonIndexReader).convertFlattenedDocIdsToDocIds(any()); - } - - @Test - public void testMissingPathWithoutDefaultThrows() { - QueryContext queryContext = distinctQuery(STRING_EXTRACT, SAME_PATH_IS_NULL_FILTER); - - JsonIndexReader jsonIndexReader = mock(JsonIndexReader.class); - when(jsonIndexReader.getMatchingFlattenedDocsMap("$.instance", SAME_PATH_IS_NULL_FILTER)).thenReturn( - new HashMap<>()); - - RuntimeException exception = expectThrows(RuntimeException.class, - () -> buildOperator(queryContext, jsonIndexReader, bufferBitmap(2), 3).nextBlock()); - - assertTrue(exception.getMessage().contains("Illegal Json Path")); - } - - private static QueryContext distinctQuery(String expression, String filterJsonString) { - return QueryContextConverterUtils.getQueryContext( - "SELECT DISTINCT " + expression + " AS tag_value FROM myTable WHERE JSON_MATCH(tags, '" - + filterJsonString.replace("'", "''") + "')"); - } - - private static void stubConvertedDocIds(JsonIndexReader jsonIndexReader, - Map convertedDocIds) { - doAnswer(invocation -> { - @SuppressWarnings("unchecked") - Map docsByValue = (Map) invocation.getArgument(0); - docsByValue.clear(); - docsByValue.putAll(convertedDocIds); - return null; - }).when(jsonIndexReader).convertFlattenedDocIdsToDocIds(any()); - } - - private static IndexSegment buildCanUseIndexSegment(JsonIndexReader jsonIndexReader) { - DataSource dataSource = mock(DataSource.class); - 
when(dataSource.getJsonIndex()).thenReturn(jsonIndexReader); - - IndexSegment indexSegment = mock(IndexSegment.class); - when(indexSegment.getDataSourceNullable("tags")).thenReturn(dataSource); - return indexSegment; - } - - private static JsonIndexDistinctOperator buildOperator(QueryContext queryContext, JsonIndexReader jsonIndexReader, - MutableRoaringBitmap filterBitmap, int numDocs) { - SegmentMetadata segmentMetadata = mock(SegmentMetadata.class); - when(segmentMetadata.getTotalDocs()).thenReturn(numDocs); - - DataSource dataSource = mock(DataSource.class); - when(dataSource.getJsonIndex()).thenReturn(jsonIndexReader); - - IndexSegment indexSegment = mock(IndexSegment.class); - when(indexSegment.getSegmentMetadata()).thenReturn(segmentMetadata); - when(indexSegment.getSegmentName()).thenReturn("testSegment"); - when(indexSegment.getDataSource(eq("tags"), any())).thenReturn(dataSource); - when(indexSegment.getDataSourceNullable("tags")).thenReturn(dataSource); - - return new JsonIndexDistinctOperator(indexSegment, new SegmentContext(indexSegment), queryContext, - new StaticBitmapFilterOperator(numDocs, filterBitmap)); - } - - private static RoaringBitmap bitmap(int... docIds) { - RoaringBitmap bitmap = new RoaringBitmap(); - for (int docId : docIds) { - bitmap.add(docId); - } - return bitmap; - } - - private static MutableRoaringBitmap bufferBitmap(int... 
docIds) { - MutableRoaringBitmap bitmap = new MutableRoaringBitmap(); - for (int docId : docIds) { - bitmap.add(docId); - } - return bitmap; - } - - private static Set extractValues(DistinctResultsBlock resultsBlock) { - List rows = resultsBlock.getRows(); - return rows.stream().map(row -> (String) row[0]).collect(Collectors.toSet()); - } - - private static final class StaticBitmapFilterOperator extends BaseFilterOperator { - private final MutableRoaringBitmap _bitmap; - - StaticBitmapFilterOperator(int numDocs, MutableRoaringBitmap bitmap) { - super(numDocs, false); - _bitmap = bitmap; - } - - @Override - public boolean canProduceBitmaps() { - return true; - } - - @Override - public BitmapCollection getBitmaps() { - return new BitmapCollection(_numDocs, false, _bitmap); - } - - @Override - public List getChildOperators() { - return List.of(); - } - - @Override - protected org.apache.pinot.core.common.BlockDocIdSet getTrues() { - throw new UnsupportedOperationException("Bitmap path only"); - } - - @Override - public String toExplainString() { - return "STATIC_BITMAP_FILTER"; - } - } -} diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/InvertedIndexDistinctOperatorTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/InvertedIndexDistinctOperatorQueriesTest.java similarity index 73% rename from pinot-core/src/test/java/org/apache/pinot/queries/InvertedIndexDistinctOperatorTest.java rename to pinot-core/src/test/java/org/apache/pinot/queries/InvertedIndexDistinctOperatorQueriesTest.java index 78708dd79b05..f7744fda7dcf 100644 --- a/pinot-core/src/test/java/org/apache/pinot/queries/InvertedIndexDistinctOperatorTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/queries/InvertedIndexDistinctOperatorQueriesTest.java @@ -27,6 +27,7 @@ import org.apache.commons.io.FileUtils; import org.apache.pinot.common.response.broker.ResultTable; import org.apache.pinot.core.operator.BaseOperator; +import org.apache.pinot.core.operator.ExecutionStatistics; 
import org.apache.pinot.core.operator.blocks.results.DistinctResultsBlock; import org.apache.pinot.core.query.distinct.table.DistinctTable; import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader; @@ -54,30 +55,36 @@ import static org.testng.Assert.assertTrue; -/** - * Tests for {@link org.apache.pinot.core.operator.query.InvertedIndexDistinctOperator}. - * - *

<p>Six segments exercise distinct operator features: - *
<ul> - *
<li><b>INT segment</b>: 10K records, 100 unique INT values (interleaved), inverted index. - * Tests cost heuristic path selection and inverted-vs-scan correctness.</li> - *
<li><b>MV segment</b>: 1K records, MV INT column (50 unique values), inverted index. - * Tests multi-value column support.</li> - *
<li><b>Sorted segment</b>: 10K records, sorted INT column (100 unique), sorted forward index. - * Tests sorted index path.</li> - *
<li><b>Mutable segment</b>: consuming segment with unsorted dictionary + inverted index. - * Tests ORDER BY correctness without relying on sorted dictIds.</li> - *
<li><b>STRING segment</b>: 5K records, STRING column (50 unique), inverted index. - * Tests STRING data type handling.</li> - *
<li><b>Null segment</b>: 1K records, INT column with nulls, inverted index. - * Tests null handling.</li> - *
</ul>
- */ -public class InvertedIndexDistinctOperatorTest extends BaseQueriesTest { +/// Tests for [org.apache.pinot.core.operator.query.InvertedIndexDistinctOperator]. +/// +/// Six segments exercise distinct operator features: +/// - **INT segment**: 10K records, 100 unique INT values (interleaved), inverted index. +/// Tests cost heuristic path selection and inverted-vs-scan correctness. +/// - **MV segment**: 1K records, MV INT column (50 unique values), inverted index. +/// Tests multi-value column support. +/// - **Sorted segment**: 10K records, sorted INT column (100 unique), sorted forward index. +/// Tests sorted index path. +/// - **Mutable segment**: consuming segment with unsorted dictionary + inverted index. +/// Tests ORDER BY correctness without relying on sorted dictIds. +/// - **STRING segment**: 5K records, STRING column (50 unique), inverted index. +/// Tests STRING data type handling. +/// - **Null segment**: 1K records, INT column with nulls, inverted index. +/// Tests null handling. 
+public class InvertedIndexDistinctOperatorQueriesTest extends BaseQueriesTest { private static final File INDEX_DIR = - new File(FileUtils.getTempDirectory(), "InvertedIndexDistinctOperatorTest"); + new File(FileUtils.getTempDirectory(), "InvertedIndexDistinctOperatorQueriesTest"); private static final String RAW_TABLE_NAME = "testTable"; + private static final String OPT = "SET useIndexBasedDistinctOperator=true; "; + private static final String OPT_INV = OPT + "SET invertedIndexDistinctCostRatio=0; "; + private static final String OPT_SCAN = OPT + "SET invertedIndexDistinctCostRatio=100000; "; + private static final String OPT_INV_NULLS = OPT_INV + "SET enableNullHandling=true; "; + private static final String OPT_SCAN_NULLS = OPT_SCAN + "SET enableNullHandling=true; "; + + private static String optWithRatio(int ratio) { + return OPT + "SET invertedIndexDistinctCostRatio=" + ratio + "; "; + } + // Active segment — swapped per test group private IndexSegment _activeSegment; private final List _allSegments = new ArrayList<>(); @@ -386,58 +393,40 @@ private boolean containsNull(ResultTable resultTable) { return false; } - private static final String OPT = "OPTION(useIndexBasedDistinctOperator=true"; - private static final String OPT_INV = OPT + ", invertedIndexDistinctCostRatio=1)"; - private static final String OPT_SCAN = OPT + ", invertedIndexDistinctCostRatio=100000)"; - // ==================== Cost Heuristic Tests ==================== @Test public void testCostRatioPathSelection() { _activeSegment = _intSegment; + String wideQuery = "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0"; + String selectiveQuery = "SELECT DISTINCT intColumn FROM testTable WHERE intColumn = 0"; // Without the query option → old DistinctOperator - assertFalse(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0"))); + assertFalse(usedInvertedIndex(runDistinct(wideQuery))); // costRatio=1, wide filter (10K docs): 100*1 <= 10000 → 
inverted - assertTrue(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 " - + OPT + ", invertedIndexDistinctCostRatio=1)"))); + assertTrue(usedInvertedIndex(runDistinct(optWithRatio(1) + wideQuery))); // costRatio=200, wide filter: 100*200=20000 > 10000 → scan - assertFalse(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 " - + OPT + ", invertedIndexDistinctCostRatio=200)"))); + assertFalse(usedInvertedIndex(runDistinct(optWithRatio(200) + wideQuery))); // costRatio=1, selective filter (100 docs): 100*1 <= 100 → inverted - assertTrue(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn = 0 " - + OPT + ", invertedIndexDistinctCostRatio=1)"))); + assertTrue(usedInvertedIndex(runDistinct(optWithRatio(1) + selectiveQuery))); // costRatio=2, selective filter: 100*2=200 > 100 → scan - assertFalse(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn = 0 " - + OPT + ", invertedIndexDistinctCostRatio=2)"))); + assertFalse(usedInvertedIndex(runDistinct(optWithRatio(2) + selectiveQuery))); // costRatio=0: force inverted index for non-empty filters - assertTrue(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn = 0 " - + OPT + ", invertedIndexDistinctCostRatio=0)"))); + assertTrue(usedInvertedIndex(runDistinct(optWithRatio(0) + selectiveQuery))); // Default costRatio=30: 100*30=3000 <= 10K → inverted - assertTrue(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 " + OPT + ")"))); + assertTrue(usedInvertedIndex(runDistinct(OPT + wideQuery))); // Boundary: costRatio=100: 100*100=10000 <= 10000 → inverted - assertTrue(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 " - + OPT + ", invertedIndexDistinctCostRatio=100)"))); + assertTrue(usedInvertedIndex(runDistinct(optWithRatio(100) + 
wideQuery))); // Above boundary: costRatio=101: 100*101=10100 > 10000 → scan - assertFalse(usedInvertedIndex(runDistinct( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 " - + OPT + ", invertedIndexDistinctCostRatio=101)"))); + assertFalse(usedInvertedIndex(runDistinct(optWithRatio(101) + wideQuery))); } @Test @@ -445,15 +434,13 @@ public void testInvertedIndexVsScanCorrectness() { _activeSegment = _intSegment; // With ORDER BY - BaseOperator invertedOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn IN " - + "(0,1,2,3,4,5,6,7,8,9) ORDER BY intColumn LIMIT 100 " + OPT_INV); + String orderByQuery = "SELECT DISTINCT intColumn FROM testTable WHERE intColumn IN (0,1,2,3,4,5,6,7,8,9) " + + "ORDER BY intColumn LIMIT 100"; + BaseOperator invertedOp = getOperator(OPT_INV + orderByQuery); DistinctTable invertedTable = invertedOp.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(invertedOp)); - BaseOperator scanOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn IN " - + "(0,1,2,3,4,5,6,7,8,9) ORDER BY intColumn LIMIT 100 " + OPT_SCAN); + BaseOperator scanOp = getOperator(OPT_SCAN + orderByQuery); DistinctTable scanTable = scanOp.nextBlock().getDistinctTable(); assertFalse(usedInvertedIndex(scanOp)); @@ -465,12 +452,11 @@ public void testInvertedIndexVsScanCorrectness() { assertEquals(extractIntValues(scanTable), expected); // Without ORDER BY — same count - BaseOperator inv2 = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 LIMIT 200 " + OPT_INV); + String noOrderByQuery = "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 LIMIT 200"; + BaseOperator inv2 = getOperator(OPT_INV + noOrderByQuery); assertEquals(inv2.nextBlock().getDistinctTable().size(), INT_NUM_UNIQUE); - BaseOperator scan2 = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 LIMIT 200 " + OPT_SCAN); + BaseOperator scan2 = getOperator(OPT_SCAN + noOrderByQuery); 
assertEquals(scan2.nextBlock().getDistinctTable().size(), INT_NUM_UNIQUE); } @@ -481,7 +467,7 @@ public void testMvColumnWithFilter() { _activeSegment = _mvSegment; BaseOperator op = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 LIMIT 1000 " + OPT_INV); + OPT_INV + "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 LIMIT 1000"); DistinctTable table = op.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(op)); assertEquals(extractIntValues(table), _filteredMvValues); @@ -490,16 +476,14 @@ public void testMvColumnWithFilter() { @Test public void testMvColumnInvertedVsScan() { _activeSegment = _mvSegment; + String query = "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 " + + "ORDER BY mvIntColumn LIMIT 1000"; - BaseOperator invertedOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 " - + "ORDER BY mvIntColumn LIMIT 1000 " + OPT_INV); + BaseOperator invertedOp = getOperator(OPT_INV + query); DistinctTable invertedTable = invertedOp.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(invertedOp)); - BaseOperator scanOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 " - + "ORDER BY mvIntColumn LIMIT 1000 " + OPT_SCAN); + BaseOperator scanOp = getOperator(OPT_SCAN + query); DistinctTable scanTable = scanOp.nextBlock().getDistinctTable(); assertFalse(usedInvertedIndex(scanOp)); @@ -511,7 +495,7 @@ public void testMvColumnMatchAll() { _activeSegment = _mvSegment; BaseOperator matchAllOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn >= 0 LIMIT 1000 " + OPT_INV); + OPT_INV + "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn >= 0 LIMIT 1000"); DistinctTable matchAllTable = matchAllOp.nextBlock().getDistinctTable(); assertEquals(extractIntValues(matchAllTable), _allMvValues); } @@ -521,20 +505,18 @@ public void testMvColumnLimit() { _activeSegment 
= _mvSegment; BaseOperator limitOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn >= 0 LIMIT 10 " + OPT_INV); + OPT_INV + "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn >= 0 LIMIT 10"); assertEquals(limitOp.nextBlock().getDistinctTable().size(), 10); } @Test public void testMvColumnOrderByDesc() { _activeSegment = _mvSegment; + String query = "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 " + + "ORDER BY mvIntColumn DESC LIMIT 1000"; - BaseOperator descOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 " - + "ORDER BY mvIntColumn DESC LIMIT 1000 " + OPT_INV); - BaseOperator descScanOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 500 " - + "ORDER BY mvIntColumn DESC LIMIT 1000 " + OPT_SCAN); + BaseOperator descOp = getOperator(OPT_INV + query); + BaseOperator descScanOp = getOperator(OPT_SCAN + query); assertEquals(extractOrderedIntValues(descOp.nextBlock().getDistinctTable().toResultTable()), extractOrderedIntValues(descScanOp.nextBlock().getDistinctTable().toResultTable())); } @@ -544,8 +526,8 @@ public void testMvColumnSelectiveFilter() { _activeSegment = _mvSegment; BaseOperator selectiveOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 3 " - + "ORDER BY mvIntColumn LIMIT 100 " + OPT_INV); + OPT_INV + "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn < 3 " + + "ORDER BY mvIntColumn LIMIT 100"); assertEquals(extractIntValues(selectiveOp.nextBlock().getDistinctTable()), new HashSet<>(Arrays.asList(0, 1, 2, 3))); } @@ -555,7 +537,7 @@ public void testMvColumnEmptyFilter() { _activeSegment = _mvSegment; BaseOperator emptyOp = getOperator( - "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn > 99999 LIMIT 1000 " + OPT_INV); + OPT_INV + "SELECT DISTINCT mvIntColumn FROM testTable WHERE svFilterColumn > 99999 LIMIT 1000"); 
assertEquals(emptyOp.nextBlock().getDistinctTable().size(), 0); } @@ -567,7 +549,7 @@ public void testSortedColumnPath() { // Should use sorted index path BaseOperator op = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn >= 0 LIMIT 1000 " + OPT + ")"); + OPT + "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn >= 0 LIMIT 1000"); DistinctTable table = op.nextBlock().getDistinctTable(); assertTrue(usedSortedIndex(op)); assertEquals(table.size(), SORTED_NUM_UNIQUE); @@ -579,7 +561,7 @@ public void testSortedColumnFilters() { // Selective filter: filterColumn < 500 → sorted values 0..4 BaseOperator selOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000 " + OPT + ")"); + OPT + "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000"); Set expected = new HashSet<>(); for (int i = 0; i < 5; i++) { expected.add(i); @@ -588,13 +570,13 @@ public void testSortedColumnFilters() { // Sparse filter: filterColumn=50 (value 0) OR filterColumn=150 (value 1) BaseOperator sparseOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn = 50 OR filterColumn = 150 " - + "LIMIT 1000 " + OPT + ")"); + OPT + "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn = 50 OR filterColumn = 150 " + + "LIMIT 1000"); assertEquals(extractIntValues(sparseOp.nextBlock().getDistinctTable()), Set.of(0, 1)); // Empty filter BaseOperator emptyOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn > 99999 LIMIT 1000 " + OPT + ")"); + OPT + "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn > 99999 LIMIT 1000"); DistinctTable emptyTable = emptyOp.nextBlock().getDistinctTable(); assertTrue(usedSortedIndex(emptyOp)); assertEquals(emptyTable.size(), 0); @@ -605,18 +587,17 @@ public void testSortedColumnLimit() { _activeSegment = _sortedSegment; BaseOperator limitOp = getOperator( - "SELECT DISTINCT sortedColumn FROM 
testTable WHERE filterColumn >= 0 LIMIT 10 " + OPT + ")"); + OPT + "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn >= 0 LIMIT 10"); assertEquals(limitOp.nextBlock().getDistinctTable().size(), 10); } @Test public void testSortedColumnMatchesScan() { _activeSegment = _sortedSegment; + String query = "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000"; - BaseOperator sortedOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000 " + OPT + ")"); - BaseOperator scanOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000"); + BaseOperator sortedOp = getOperator(OPT + query); + BaseOperator scanOp = getOperator(query); assertEquals(extractIntValues(sortedOp.nextBlock().getDistinctTable()), extractIntValues(scanOp.nextBlock().getDistinctTable())); } @@ -624,13 +605,11 @@ public void testSortedColumnMatchesScan() { @Test public void testSortedColumnOrderByDesc() { _activeSegment = _sortedSegment; + String query = "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 " + + "ORDER BY sortedColumn DESC LIMIT 1000"; - BaseOperator descOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 " - + "ORDER BY sortedColumn DESC LIMIT 1000 " + OPT + ")"); - BaseOperator descScanOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn < 500 " - + "ORDER BY sortedColumn DESC LIMIT 1000"); + BaseOperator descOp = getOperator(OPT + query); + BaseOperator descScanOp = getOperator(query); assertEquals(extractOrderedIntValues(descOp.nextBlock().getDistinctTable().toResultTable()), extractOrderedIntValues(descScanOp.nextBlock().getDistinctTable().toResultTable())); } @@ -638,14 +617,12 @@ public void testSortedColumnOrderByDesc() { @Test public void testSortedColumnOrderByDescWithLimit() { _activeSegment = _sortedSegment; - int limit = 5; - BaseOperator descLimitOp = getOperator( - 
"SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY sortedColumn DESC LIMIT " + limit + " " + OPT + ")"); - BaseOperator descLimitScanOp = getOperator( - "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY sortedColumn DESC LIMIT " + limit); + String query = "SELECT DISTINCT sortedColumn FROM testTable WHERE filterColumn >= 0 " + + "ORDER BY sortedColumn DESC LIMIT " + limit; + + BaseOperator descLimitOp = getOperator(OPT + query); + BaseOperator descLimitScanOp = getOperator(query); DistinctTable descLimitTable = descLimitOp.nextBlock().getDistinctTable(); ResultTable descLimitResultTable = descLimitTable.toResultTable(); assertEquals(extractOrderedIntValues(descLimitResultTable), @@ -656,17 +633,14 @@ public void testSortedColumnOrderByDescWithLimit() { @Test public void testMutableSegmentOrderByUsesInvertedIndex() { _activeSegment = _mutableSegment; - String bitmapCapableAllDocsFilter = "intColumn IN (0,1,2,3,4,5,6,7,8,9)"; + String query = "SELECT DISTINCT intColumn FROM testTable WHERE intColumn IN (0,1,2,3,4,5,6,7,8,9) " + + "ORDER BY intColumn DESC LIMIT 5"; - BaseOperator invertedOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE " + bitmapCapableAllDocsFilter + ' ' - + "ORDER BY intColumn DESC LIMIT 5 " + OPT_INV); + BaseOperator invertedOp = getOperator(OPT_INV + query); DistinctTable invertedTable = invertedOp.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(invertedOp)); - BaseOperator scanOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE " + bitmapCapableAllDocsFilter + ' ' - + "ORDER BY intColumn DESC LIMIT 5 " + OPT_SCAN); + BaseOperator scanOp = getOperator(OPT_SCAN + query); DistinctTable scanTable = scanOp.nextBlock().getDistinctTable(); assertFalse(usedInvertedIndex(scanOp)); @@ -683,7 +657,7 @@ public void testStringColumnWithFilter() { _activeSegment = _stringSegment; BaseOperator op = getOperator( - "SELECT DISTINCT stringColumn 
FROM testTable WHERE filterColumn < 500 LIMIT 1000 " + OPT_INV); + OPT_INV + "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000"); DistinctTable table = op.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(op)); assertEquals(table.size(), STRING_NUM_UNIQUE); @@ -692,16 +666,14 @@ public void testStringColumnWithFilter() { @Test public void testStringColumnInvertedVsScan() { _activeSegment = _stringSegment; + String query = "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn < 200 " + + "ORDER BY stringColumn LIMIT 1000"; - BaseOperator invertedOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn < 200 " - + "ORDER BY stringColumn LIMIT 1000 " + OPT_INV); + BaseOperator invertedOp = getOperator(OPT_INV + query); DistinctTable invertedTable = invertedOp.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(invertedOp)); - BaseOperator scanOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn < 200 " - + "ORDER BY stringColumn LIMIT 1000 " + OPT_SCAN); + BaseOperator scanOp = getOperator(OPT_SCAN + query); assertEquals(extractOrderedStringValues(invertedTable.toResultTable()), extractOrderedStringValues(scanOp.nextBlock().getDistinctTable().toResultTable())); } @@ -709,13 +681,11 @@ public void testStringColumnInvertedVsScan() { @Test public void testStringColumnOrderByDesc() { _activeSegment = _stringSegment; + String query = "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn >= 0 " + + "ORDER BY stringColumn DESC LIMIT 1000"; - BaseOperator descOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY stringColumn DESC LIMIT 1000 " + OPT_INV); - BaseOperator descScanOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY stringColumn DESC LIMIT 1000 " + OPT_SCAN); + BaseOperator descOp = getOperator(OPT_INV + query); + BaseOperator 
descScanOp = getOperator(OPT_SCAN + query); assertEquals(extractOrderedStringValues(descOp.nextBlock().getDistinctTable().toResultTable()), extractOrderedStringValues(descScanOp.nextBlock().getDistinctTable().toResultTable())); } @@ -723,13 +693,11 @@ public void testStringColumnOrderByDesc() { @Test public void testStringColumnOrderByDescWithLimit() { _activeSegment = _stringSegment; + String query = "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn >= 0 " + + "ORDER BY stringColumn DESC LIMIT 5"; - BaseOperator descLimitOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY stringColumn DESC LIMIT 5 " + OPT_INV); - BaseOperator descLimitScanOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY stringColumn DESC LIMIT 5"); + BaseOperator descLimitOp = getOperator(OPT_INV + query); + BaseOperator descLimitScanOp = getOperator(query); DistinctTable descLimitStrTable = descLimitOp.nextBlock().getDistinctTable(); ResultTable descLimitResultTable = descLimitStrTable.toResultTable(); assertEquals(extractOrderedStringValues(descLimitResultTable), @@ -742,20 +710,18 @@ public void testStringColumnEmptyFilter() { _activeSegment = _stringSegment; BaseOperator emptyOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn > 99999 LIMIT 1000 " + OPT_INV); + OPT_INV + "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn > 99999 LIMIT 1000"); assertEquals(emptyOp.nextBlock().getDistinctTable().size(), 0); } @Test public void testStringColumnSelectiveFilter() { _activeSegment = _stringSegment; + String query = "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn < 100 " + + "ORDER BY stringColumn LIMIT 100"; - BaseOperator selectiveOp = getOperator( - "SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn < 100 " - + "ORDER BY stringColumn LIMIT 100 " + OPT_INV); - BaseOperator selectiveScanOp = getOperator( - 
"SELECT DISTINCT stringColumn FROM testTable WHERE filterColumn < 100 " - + "ORDER BY stringColumn LIMIT 100 " + OPT_SCAN); + BaseOperator selectiveOp = getOperator(OPT_INV + query); + BaseOperator selectiveScanOp = getOperator(OPT_SCAN + query); assertEquals(extractOrderedStringValues(selectiveOp.nextBlock().getDistinctTable().toResultTable()), extractOrderedStringValues(selectiveScanOp.nextBlock().getDistinctTable().toResultTable())); } @@ -767,8 +733,7 @@ public void testNullIncludedWithWideFilter() { _activeSegment = _nullSegment; BaseOperator op = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 LIMIT 1000 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 LIMIT 1000"); DistinctTable table = op.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(op)); assertEquals(table.size(), NULL_NUM_UNIQUE + 1); @@ -781,8 +746,7 @@ public void testNullExcludedWithSelectiveFilter() { // filterColumn < 500 → docs 0-499, all non-null BaseOperator op = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn < 500 LIMIT 1000"); DistinctTable table = op.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(op)); assertFalse(containsNull(table)); @@ -795,8 +759,7 @@ public void testNullWithPartialFilter() { // filterColumn >= 940 → docs 940-999 (10 non-null + 50 null) BaseOperator op = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 940 LIMIT 1000 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 940 LIMIT 1000"); DistinctTable table = op.nextBlock().getDistinctTable(); 
assertTrue(usedInvertedIndex(op)); assertTrue(containsNull(table)); @@ -808,9 +771,8 @@ public void testNullHandlingOrderBy() { _activeSegment = _nullSegment; BaseOperator orderOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY intColumn LIMIT 1000 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " + + "ORDER BY intColumn LIMIT 1000"); ResultTable resultTable = orderOp.nextBlock().getDistinctTable().toResultTable(); assertEquals(resultTable.getRows().size(), NULL_NUM_UNIQUE + 1); assertEquals(resultTable.getRows().get(0)[0], 0); @@ -822,9 +784,8 @@ public void testNullHandlingOrderByNullsFirstLimit() { _activeSegment = _nullSegment; BaseOperator limitOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY intColumn NULLS FIRST LIMIT 10 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " + + "ORDER BY intColumn NULLS FIRST LIMIT 10"); ResultTable resultTable = limitOp.nextBlock().getDistinctTable().toResultTable(); assertEquals(resultTable.getRows().size(), 10); assertNull(resultTable.getRows().get(0)[0]); @@ -836,9 +797,8 @@ public void testNullHandlingOrderByNullsLastLimit() { _activeSegment = _nullSegment; BaseOperator limitOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY intColumn LIMIT 10 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " + + "ORDER BY intColumn LIMIT 10"); ResultTable resultTable = limitOp.nextBlock().getDistinctTable().toResultTable(); assertEquals(resultTable.getRows().size(), 10); assertFalse(containsNull(resultTable)); @@ -851,9 +811,8 @@ public void 
testNullHandlingOrderByDescNullsLastLimit() { _activeSegment = _nullSegment; BaseOperator limitOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " - + "ORDER BY intColumn DESC NULLS LAST LIMIT 10 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 " + + "ORDER BY intColumn DESC NULLS LAST LIMIT 10"); ResultTable resultTable = limitOp.nextBlock().getDistinctTable().toResultTable(); assertEquals(resultTable.getRows().size(), 10); assertFalse(containsNull(resultTable)); @@ -864,18 +823,14 @@ public void testNullHandlingOrderByDescNullsLastLimit() { @Test public void testNullHandlingInvertedVsScan() { _activeSegment = _nullSegment; + String query = "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 940 " + + "ORDER BY intColumn LIMIT 1000"; - BaseOperator invertedOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 940 " - + "ORDER BY intColumn LIMIT 1000 " - + OPT + ", invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + BaseOperator invertedOp = getOperator(OPT_INV_NULLS + query); ResultTable invertedResultTable = invertedOp.nextBlock().getDistinctTable().toResultTable(); assertTrue(usedInvertedIndex(invertedOp)); - BaseOperator scanOp = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 940 " - + "ORDER BY intColumn LIMIT 1000 " - + OPT + ", invertedIndexDistinctCostRatio=100000, enableNullHandling=true)"); + BaseOperator scanOp = getOperator(OPT_SCAN_NULLS + query); ResultTable scanResultTable = scanOp.nextBlock().getDistinctTable().toResultTable(); assertFalse(usedInvertedIndex(scanOp)); @@ -888,8 +843,7 @@ public void testNullPreservedInBrokerResultWithoutOrderByLimit() { _activeSegment = _nullSegment; BaseOperator op = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 940 LIMIT 10 " - + OPT + ", 
invertedIndexDistinctCostRatio=1, enableNullHandling=true)"); + OPT_INV_NULLS + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 940 LIMIT 10"); DistinctTable table = op.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(op)); @@ -911,11 +865,58 @@ public void testPlaceholderWithoutNullHandling() { _activeSegment = _nullSegment; BaseOperator op = getOperator( - "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 LIMIT 1000 " + OPT_INV); + OPT_INV + "SELECT DISTINCT intColumn FROM testTable WHERE filterColumn >= 0 LIMIT 1000"); DistinctTable table = op.nextBlock().getDistinctTable(); assertTrue(usedInvertedIndex(op)); assertFalse(containsNull(table)); // Without null handling: 50 real values + Integer.MIN_VALUE placeholder = 51 assertEquals(table.size(), NULL_NUM_UNIQUE + 1); } + + // ==================== Execution Statistics ==================== + + /// Inverted-index path emits `numDocsScanned` for matching docs and `numEntriesScannedPostFilter` for dictionary + /// entries examined; scan fallback emits the same `numDocsScanned` but reports it again as + /// `numEntriesScannedPostFilter` since the scan visits every matching doc. Both paths agree on total docs and on + /// the matching doc count for the same filter. + @Test + public void testExecutionStatistics() { + _activeSegment = _intSegment; + + // IN-list filter matches 10 values × 100 records = 1000 docs. 
+ String inListQuery = + "SELECT DISTINCT intColumn FROM testTable WHERE intColumn IN (0,1,2,3,4,5,6,7,8,9) LIMIT 100"; + BaseOperator invertedOp = getOperator(OPT_INV + inListQuery); + invertedOp.nextBlock(); + assertTrue(usedInvertedIndex(invertedOp)); + ExecutionStatistics invertedStats = invertedOp.getExecutionStatistics(); + assertEquals(invertedStats.getNumDocsScanned(), 1000); + assertEquals(invertedStats.getNumTotalDocs(), INT_NUM_RECORDS); + assertTrue(invertedStats.getNumEntriesScannedPostFilter() > 0); + + BaseOperator scanOp = getOperator(OPT_SCAN + inListQuery); + scanOp.nextBlock(); + assertFalse(usedInvertedIndex(scanOp)); + ExecutionStatistics scanStats = scanOp.getExecutionStatistics(); + assertEquals(scanStats.getNumDocsScanned(), 1000); + assertEquals(scanStats.getNumTotalDocs(), INT_NUM_RECORDS); + assertEquals(scanStats.getNumEntriesScannedPostFilter(), 1000); + + // Empty filter: no docs scanned, no entries examined. + String emptyQuery = "SELECT DISTINCT intColumn FROM testTable WHERE intColumn = -1 LIMIT 100"; + BaseOperator emptyOp = getOperator(OPT_INV + emptyQuery); + emptyOp.nextBlock(); + ExecutionStatistics emptyStats = emptyOp.getExecutionStatistics(); + assertEquals(emptyStats.getNumDocsScanned(), 0); + assertEquals(emptyStats.getNumEntriesScannedPostFilter(), 0); + assertEquals(emptyStats.getNumTotalDocs(), INT_NUM_RECORDS); + + // Wide filter matching all docs. 
+ String wideQuery = "SELECT DISTINCT intColumn FROM testTable WHERE intColumn >= 0 LIMIT 1000"; + BaseOperator wideOp = getOperator(OPT_INV + wideQuery); + wideOp.nextBlock(); + ExecutionStatistics wideStats = wideOp.getExecutionStatistics(); + assertEquals(wideStats.getNumDocsScanned(), INT_NUM_RECORDS); + assertEquals(wideStats.getNumTotalDocs(), INT_NUM_RECORDS); + } } diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/JsonIndexDistinctOperatorQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/JsonIndexDistinctOperatorQueriesTest.java new file mode 100644 index 000000000000..2fd35218c4b8 --- /dev/null +++ b/pinot-core/src/test/java/org/apache/pinot/queries/JsonIndexDistinctOperatorQueriesTest.java @@ -0,0 +1,447 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.queries; + +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.common.response.broker.BrokerResponseNative; +import org.apache.pinot.common.response.broker.ResultTable; +import org.apache.pinot.core.operator.BaseOperator; +import org.apache.pinot.core.operator.ExecutionStatistics; +import org.apache.pinot.core.operator.blocks.results.DistinctResultsBlock; +import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader; +import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; +import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig; +import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader; +import org.apache.pinot.segment.spi.IndexSegment; +import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; +import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.config.table.JsonIndexConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; +import org.apache.pinot.spi.utils.JsonUtils; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.expectThrows; + + +/// Queries tests for [org.apache.pinot.core.operator.query.JsonIndexDistinctOperator] against the JSON index. 
+/// +/// Two segments cover the operator's behavior: +/// - **Full segment**: 500 docs, every doc has both `$.k1` and `$.k2`; `filterCol` is nullable in the last 100 docs. +/// Used for path parity, base-column-filter null handling, same/cross-path `JSON_MATCH` filters, the 5-arg +/// `jsonFilterExpression`, the `jsonIndexDistinctSkipMissingPath` query option, execution-statistics shape, and +/// the construction-time validation throw for invalid 4-arg defaults. +/// - **Sparse segment**: 20 docs, only the first 10 have `$.k1`. Used for the 4-arg-default, the same-path `IS NULL` +/// filter that triggers the default, the 3-arg "Illegal Json Path" throw, and the skip-option's suppression of +/// that throw. +/// +/// Composite-only behavior (selective `invertedIndexConfigs`, `jsonExtractScalar` fallback on non-indexed paths) +/// is covered separately in `ai.startree.integration.tests.JsonIndexDistinctOperatorCompositeSharedClusterTest`. +public class JsonIndexDistinctOperatorQueriesTest extends BaseQueriesTest { + private static final File INDEX_DIR = + new File(FileUtils.getTempDirectory(), "JsonIndexDistinctOperatorQueriesTest"); + private static final String RAW_TABLE_NAME = "testTable"; + private static final String JSON_COL = "jsonCol"; + private static final String FILTER_COL = "filterCol"; + + private static final int FULL_NUM_DOCS = 500; + private static final int FULL_NUM_DISTINCT_K1 = 50; + private static final int FULL_NUM_NON_NULL_FILTER = 400; + + private static final int SPARSE_NUM_DOCS = 20; + private static final int SPARSE_NUM_WITH_K1 = 10; + + private static final String OPT_USE_INDEX = "SET useIndexBasedDistinctOperator=true; "; + private static final String OPT_NULLS = "SET enableNullHandling=true; "; + private static final String OPT_USE_INDEX_NULLS = OPT_USE_INDEX + OPT_NULLS; + private static final String OPT_USE_INDEX_SKIP_MISSING_PATH = + OPT_USE_INDEX + "SET jsonIndexDistinctSkipMissingPath=true; "; + + private IndexSegment _fullSegment; 
+ private IndexSegment _sparseSegment; + private IndexSegment _activeSegment; + + @Override + protected String getFilter() { + return ""; + } + + @Override + protected IndexSegment getIndexSegment() { + return _activeSegment; + } + + @Override + protected List getIndexSegments() { + return List.of(_activeSegment, _activeSegment); + } + + @BeforeClass + public void setUp() + throws Exception { + FileUtils.deleteDirectory(INDEX_DIR); + _fullSegment = buildFullSegment(); + _sparseSegment = buildSparseSegment(); + } + + @AfterClass + public void tearDown() { + if (_fullSegment != null) { + _fullSegment.destroy(); + } + if (_sparseSegment != null) { + _sparseSegment.destroy(); + } + FileUtils.deleteQuietly(INDEX_DIR); + } + + private IndexSegment buildFullSegment() + throws Exception { + Schema schema = new Schema.SchemaBuilder().setSchemaName(RAW_TABLE_NAME) + .addSingleValueDimension(JSON_COL, DataType.STRING) + .addSingleValueDimension(FILTER_COL, DataType.INT) + .build(); + + List records = new ArrayList<>(FULL_NUM_DOCS); + for (int i = 0; i < FULL_NUM_DOCS; i++) { + Map json = Map.of( + "k1", "value-k1-" + (i % FULL_NUM_DISTINCT_K1), + "k2", "value-k2-" + i + ); + GenericRow record = new GenericRow(); + record.putValue(JSON_COL, JsonUtils.objectToString(json)); + record.putValue(FILTER_COL, i < FULL_NUM_NON_NULL_FILTER ? 
i : null); + records.add(record); + } + + TableConfig tableConfig = createTableConfig(true); + return buildSegment("fullSegment", schema, tableConfig, records, true); + } + + private IndexSegment buildSparseSegment() + throws Exception { + Schema schema = new Schema.SchemaBuilder().setSchemaName(RAW_TABLE_NAME) + .addSingleValueDimension(JSON_COL, DataType.STRING) + .build(); + + List records = new ArrayList<>(SPARSE_NUM_DOCS); + for (int i = 0; i < SPARSE_NUM_DOCS; i++) { + Map json = new HashMap<>(); + if (i < SPARSE_NUM_WITH_K1) { + json.put("k1", "k1-" + i); + } + json.put("k2", "k2-" + i); + GenericRow record = new GenericRow(); + record.putValue(JSON_COL, JsonUtils.objectToString(json)); + records.add(record); + } + + TableConfig tableConfig = createTableConfig(false); + return buildSegment("sparseSegment", schema, tableConfig, records, false); + } + + private TableConfig createTableConfig(boolean withFilterCol) { + ObjectNode indexes = JsonUtils.newObjectNode(); + indexes.set("json", new JsonIndexConfig().toJsonNode()); + TableConfigBuilder builder = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .addFieldConfig(new FieldConfig.Builder(JSON_COL) + .withEncodingType(FieldConfig.EncodingType.RAW) + .withIndexes(indexes) + .build()); + if (withFilterCol) { + builder.setNullHandlingEnabled(true); + } + return builder.build(); + } + + private IndexSegment buildSegment(String segmentName, Schema schema, TableConfig tableConfig, + List records, boolean defaultNullHandling) + throws Exception { + File segmentDir = new File(INDEX_DIR, segmentName + "_dir"); + SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema); + config.setTableName(RAW_TABLE_NAME); + config.setSegmentName(segmentName); + config.setOutDir(segmentDir.getAbsolutePath()); + config.setDefaultNullHandlingEnabled(defaultNullHandling); + + SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl(); + driver.init(config, new 
GenericRowRecordReader(records)); + driver.build(); + + return ImmutableSegmentLoader.load(new File(segmentDir, segmentName), new IndexLoadingConfig(tableConfig, schema)); + } + + /// With the OSS JSON index `$.k1` is indexed, so `useIndexBasedDistinctOperator=true` must route through + /// [JsonIndexDistinctOperator] and produce the same set of distinct values as the scan-based baseline. + @Test + public void testIndexedPathParity() { + _activeSegment = _fullSegment; + String selectExpr = "jsonExtractIndex(jsonCol, '$.k1', 'STRING')"; + String baselineQuery = "SELECT DISTINCT " + selectExpr + " FROM testTable ORDER BY " + selectExpr + " LIMIT 10000"; + String optimizedQuery = OPT_USE_INDEX + baselineQuery; + + BaseOperator optimizedOp = getOperator(optimizedQuery); + assertTrue(optimizedOp.toExplainString().contains("DISTINCT_JSON_INDEX")); + + BrokerResponseNative baseline = getBrokerResponse(baselineQuery); + BrokerResponseNative optimized = getBrokerResponse(optimizedQuery); + assertEquals(extractStringValues(optimized.getResultTable()), extractStringValues(baseline.getResultTable())); + assertEquals(optimized.getResultTable().getRows().size(), FULL_NUM_DISTINCT_K1); + } + + /// A nullable filter column drives the doc-id set into [JsonIndexDistinctOperator]. With null handling disabled, + /// `filterCol < N` still excludes nulls naturally; with `enableNullHandling=true`, `filterCol IS NOT NULL` must + /// produce the same distinct values as the scan-based baseline. Regression coverage for the prior bug where the + /// operator did not honor null exclusion delivered through a base-column filter. 
+ @Test + public void testNullHandlingOnSeparateFilterColumn() { + _activeSegment = _fullSegment; + String selectExpr = "jsonExtractIndex(jsonCol, '$.k1', 'STRING')"; + + String rangeBaselineQuery = + "SELECT DISTINCT " + selectExpr + " FROM testTable WHERE filterCol < " + FULL_NUM_NON_NULL_FILTER + " " + + "ORDER BY " + selectExpr + " LIMIT 10000"; + String rangeOptimizedQuery = OPT_USE_INDEX + rangeBaselineQuery; + BaseOperator rangeOptimizedOp = getOperator(rangeOptimizedQuery); + assertTrue(rangeOptimizedOp.toExplainString().contains("DISTINCT_JSON_INDEX")); + BrokerResponseNative rangeBaseline = getBrokerResponse(rangeBaselineQuery); + BrokerResponseNative rangeOptimized = getBrokerResponse(rangeOptimizedQuery); + assertEquals(extractStringValues(rangeOptimized.getResultTable()), + extractStringValues(rangeBaseline.getResultTable())); + assertFalse(containsNull(rangeOptimized.getResultTable())); + + String isNotNullBody = + "SELECT DISTINCT " + selectExpr + " FROM testTable WHERE filterCol IS NOT NULL " + + "ORDER BY " + selectExpr + " LIMIT 10000"; + String isNotNullBaselineQuery = OPT_NULLS + isNotNullBody; + String isNotNullOptimizedQuery = OPT_USE_INDEX_NULLS + isNotNullBody; + BaseOperator isNotNullOptimizedOp = getOperator(isNotNullOptimizedQuery); + assertTrue(isNotNullOptimizedOp.toExplainString().contains("DISTINCT_JSON_INDEX")); + BrokerResponseNative isNotNullBaseline = getBrokerResponse(isNotNullBaselineQuery); + BrokerResponseNative isNotNullOptimized = getBrokerResponse(isNotNullOptimizedQuery); + assertEquals(extractStringValues(isNotNullOptimized.getResultTable()), + extractStringValues(isNotNullBaseline.getResultTable())); + assertFalse(containsNull(isNotNullOptimized.getResultTable())); + } + + /// Same-path `JSON_MATCH` on the indexed column means the filter resolves entirely inside the JSON index, so the + /// distinct set must contain only the values that satisfy the predicate. 
With `value-k1-0` selected, the result + /// is a single distinct value. + @Test + public void testSamePathJsonMatchFilter() { + _activeSegment = _fullSegment; + String query = OPT_USE_INDEX + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING') FROM testTable " + + "WHERE JSON_MATCH(jsonCol, '\"$.k1\" = ''value-k1-0''') LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + assertEquals(extractStringValues(getBrokerResponse(query).getResultTable()), Set.of("value-k1-0")); + } + + /// Cross-path `JSON_MATCH` (filter on `$.k2`, select distinct of `$.k1`) intersects the per-value doc ids from the + /// `$.k1` JSON-index lookup with the doc set produced by the `WHERE`-clause filter on `$.k2`, returning only the + /// `$.k1` values for docs whose `$.k2` matches. + @Test + public void testCrossPathJsonMatchFilter() { + _activeSegment = _fullSegment; + // `$.k2` = `value-k2-7` matches a single doc; that doc's `$.k1` is `value-k1-(7 % 50)` = `value-k1-7`. + String query = OPT_USE_INDEX + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING') FROM testTable " + + "WHERE JSON_MATCH(jsonCol, '\"$.k2\" = ''value-k2-7''') LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + assertEquals(extractStringValues(getBrokerResponse(query).getResultTable()), Set.of("value-k1-7")); + } + + /// A 4-arg `jsonExtractIndex` whose default literal cannot be parsed into the requested type still routes through + /// the JSON-index operator (planner-time `canUseJsonIndexDistinct` only checks the function name); the operator's + /// constructor surfaces the validation failure as `IllegalArgumentException`. 
+ @Test + public void testInvalidDefaultArgThrowsAtConstruction() { + _activeSegment = _fullSegment; + String query = OPT_USE_INDEX + + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'INT', 'abc') FROM testTable LIMIT 100"; + IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> getOperator(query)); + assertTrue(exception.getMessage().contains("Default value")); + } + + /// With a 4-arg `jsonExtractIndex(..., defaultValue)` and docs that don't have the path, the default value must be + /// added once to the distinct set. The sparse segment has 10 docs with `$.k1` and 10 without, so the result is the + /// 10 distinct `$.k1` values plus the literal `missing`. + @Test + public void testFourArgDefaultForDocsWithoutPath() { + _activeSegment = _sparseSegment; + String query = OPT_USE_INDEX + + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING', 'missing') FROM testTable LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + + Set expected = new HashSet<>(); + for (int i = 0; i < SPARSE_NUM_WITH_K1; i++) { + expected.add("k1-" + i); + } + expected.add("missing"); + assertEquals(extractStringValues(getBrokerResponse(query).getResultTable()), expected); + } + + /// `"$.k1" IS NULL` selects only docs missing the path. None of the values returned by the JSON-index lookup + /// intersect the filtered doc set, so `handleMissingDocs` adds the 4-arg default for the unmatched docs. Result + /// is the default alone. 
+ @Test + public void testSamePathIsNullFilterWithDefault() { + _activeSegment = _sparseSegment; + String query = OPT_USE_INDEX + + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING', 'missing') FROM testTable " + + "WHERE JSON_MATCH(jsonCol, '\"$.k1\" IS NULL') LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + assertEquals(extractStringValues(getBrokerResponse(query).getResultTable()), Set.of("missing")); + } + + /// 3-arg `jsonExtractIndex` (no default) over a segment where some docs miss the path must throw + /// `Illegal Json Path` once `handleMissingDocs` is reached. + @Test + public void testMissingPathThrowsWithoutDefault() { + _activeSegment = _sparseSegment; + String query = OPT_USE_INDEX + + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING') FROM testTable LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + RuntimeException exception = expectThrows(RuntimeException.class, op::nextBlock); + assertTrue(exception.getMessage().contains("Illegal Json Path")); + } + + /// `numDocsScanned` reports the count of matching docs (either the filter bitmap's cardinality, or `_totalDocs` when + /// the filter is MatchAll). `numEntriesScannedPostFilter` is the count of distinct JSON-index values examined. + /// `numEntriesScannedInFilter` is reported by the underlying filter operator (0 for MatchAll, positive when the + /// filter materializes a bitmap). + @Test + public void testExecutionStatistics() { + _activeSegment = _fullSegment; + + // Unfiltered: filter is MatchAll → numDocsScanned == _totalDocs, numEntriesScannedInFilter == 0, every distinct + // k1 value is examined post-filter. 
+ String unfilteredQuery = + OPT_USE_INDEX + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING') FROM testTable LIMIT 10000"; + BaseOperator unfilteredOp = getOperator(unfilteredQuery); + unfilteredOp.nextBlock(); + ExecutionStatistics unfilteredStats = unfilteredOp.getExecutionStatistics(); + assertEquals(unfilteredStats.getNumDocsScanned(), FULL_NUM_DOCS); + assertEquals(unfilteredStats.getNumEntriesScannedInFilter(), 0); + assertEquals(unfilteredStats.getNumEntriesScannedPostFilter(), FULL_NUM_DISTINCT_K1); + assertEquals(unfilteredStats.getNumTotalDocs(), FULL_NUM_DOCS); + + // Base-column filter: numDocsScanned == cardinality of the filter bitmap. numEntriesScannedInFilter > 0 since the + // scan-based filter materializes its bitmap by visiting docs. `filterCol >= 0` excludes the Integer.MIN_VALUE + // null-placeholder docs (since null handling is not enabled here), leaving exactly the FULL_NUM_NON_NULL_FILTER + // non-null docs. + String filteredQuery = OPT_USE_INDEX + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING') FROM testTable " + + "WHERE filterCol >= 0 LIMIT 10000"; + BaseOperator filteredOp = getOperator(filteredQuery); + filteredOp.nextBlock(); + ExecutionStatistics filteredStats = filteredOp.getExecutionStatistics(); + assertEquals(filteredStats.getNumDocsScanned(), FULL_NUM_NON_NULL_FILTER); + assertEquals(filteredStats.getNumEntriesScannedInFilter(), FULL_NUM_DOCS); + assertEquals(filteredStats.getNumEntriesScannedPostFilter(), FULL_NUM_DISTINCT_K1); + assertEquals(filteredStats.getNumTotalDocs(), FULL_NUM_DOCS); + } + + /// 5-arg form pushes the `jsonFilterExpression` literal directly into `getMatchingFlattenedDocsMap`, so the JSON + /// index returns only entries whose values satisfy the filter. Any doc whose value does not satisfy the filter is + /// seen by the distinct operator as missing-path; with a 4-arg default present, the default is added to the + /// distinct set for those docs. 
Matches `JsonExtractIndexTransformFunction`'s per-doc behavior, where + /// `getValuesSV` returns null for docs outside the filtered map and the loop substitutes the default. + @Test + public void testFiveArgFilterJsonExpression() { + _activeSegment = _fullSegment; + // 5-arg filter narrows the index to value-k1-3 (10 docs). The remaining 490 docs see their $.k1 as missing under + // this filter and pick up the 4-arg default 'missing'. Result is the union of both. + String query = OPT_USE_INDEX + "SELECT DISTINCT " + + "jsonExtractIndex(jsonCol, '$.k1', 'STRING', 'missing', '\"$.k1\" = ''value-k1-3''') " + + "FROM testTable LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + assertEquals(extractStringValues(getBrokerResponse(query).getResultTable()), + Set.of("value-k1-3", "missing")); + } + + /// `jsonIndexDistinctSkipMissingPath=true` disables `handleMissingDocs` entirely. Even when docs are + /// "missing" from the index (here, all docs outside the 5-arg filter), the 4-arg default is NOT added, no null is + /// added under nullHandling, and the 3-arg "Illegal Json Path" throw is suppressed. The distinct set is exactly + /// the values the JSON index returned. + @Test + public void testSkipMissingPath() { + _activeSegment = _fullSegment; + String query = OPT_USE_INDEX_SKIP_MISSING_PATH + + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING', 'missing', '\"$.k1\" = ''value-k1-3''') " + + "FROM testTable LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + // No 'missing' even though the 4-arg default is set and 490 docs are "missing" under the 5-arg filter. 
+ assertEquals(extractStringValues(getBrokerResponse(query).getResultTable()), Set.of("value-k1-3")); + } + + /// With the skip option, a 3-arg call over a sparse segment (docs without `$.k1`) no longer throws — it just + /// returns the values it did find in the index. + @Test + public void testSkipMissingPathSuppressesThrow() { + _activeSegment = _sparseSegment; + String query = OPT_USE_INDEX_SKIP_MISSING_PATH + + "SELECT DISTINCT jsonExtractIndex(jsonCol, '$.k1', 'STRING') FROM testTable LIMIT 100"; + BaseOperator op = getOperator(query); + assertTrue(op.toExplainString().contains("DISTINCT_JSON_INDEX")); + Set expected = new HashSet<>(); + for (int i = 0; i < SPARSE_NUM_WITH_K1; i++) { + expected.add("k1-" + i); + } + assertEquals(extractStringValues(getBrokerResponse(query).getResultTable()), expected); + } + + private static Set extractStringValues(ResultTable resultTable) { + Set values = new HashSet<>(); + for (Object[] row : resultTable.getRows()) { + values.add(row[0] == null ? 
null : (String) row[0]); + } + return values; + } + + private static boolean containsNull(ResultTable resultTable) { + for (Object[] row : resultTable.getRows()) { + if (row[0] == null) { + return true; + } + } + return false; + } +} diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/JsonPathTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/JsonPathTest.java index 520f55126ab6..bb90cf7313da 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/JsonPathTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/JsonPathTest.java @@ -24,8 +24,6 @@ import com.jayway.jsonpath.spi.cache.CacheProvider; import java.io.File; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -36,29 +34,40 @@ import org.apache.pinot.spi.config.table.TableType; import org.apache.pinot.spi.config.table.ingestion.IngestionConfig; import org.apache.pinot.spi.config.table.ingestion.TransformConfig; -import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.FieldSpec.DataType; import org.apache.pinot.spi.data.Schema; import org.apache.pinot.spi.exception.QueryErrorCode; +import org.apache.pinot.spi.utils.CommonConstants.Broker.Request.QueryOptionKey; import org.apache.pinot.spi.utils.JsonUtils; import org.apache.pinot.spi.utils.builder.TableConfigBuilder; -import org.testng.Assert; import org.testng.annotations.Test; -import static org.apache.pinot.spi.utils.CommonConstants.Broker.Request.QueryOptionKey.USE_INDEX_BASED_DISTINCT_OPERATOR; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; @Test(suiteName = "CustomClusterIntegrationTest") public class JsonPathTest extends 
CustomDataQueryClusterIntegrationTest { - protected static final String DEFAULT_TABLE_NAME = "JsonPathTest"; protected static final int NUM_DOCS_PER_SEGMENT = 1000; + // Number of distinct values for myMapStr.$.k1 across the segment. Setting this lower than NUM_DOCS_PER_SEGMENT + // forces value repetition, so the JsonIndexDistinct path (which enumerates dictionary values once) and the scan + // path (which visits every doc) return the same result set but follow visibly different code paths. + private static final int NUM_DISTINCT_K1 = 100; private static final String MY_MAP_STR_FIELD_NAME = "myMapStr"; private static final String MY_MAP_STR_K1_FIELD_NAME = "myMapStr_k1"; private static final String MY_MAP_STR_K2_FIELD_NAME = "myMapStr_k2"; private static final String COMPLEX_MAP_STR_FIELD_NAME = "complexMapStr"; private static final String COMPLEX_MAP_STR_K3_FIELD_NAME = "complexMapStr_k3"; + // Query-option strings passed to postQueryWithOptions. + private static final String OPT_USE_INDEX = QueryOptionKey.USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"; + private static final String OPT_USE_INDEX_SKIP_MISSING_PATH = + OPT_USE_INDEX + ";" + QueryOptionKey.JSON_INDEX_DISTINCT_SKIP_MISSING_PATH + "=true"; + protected final List _sortedSequenceIds = new ArrayList<>(NUM_DOCS_PER_SEGMENT); @Override @@ -67,32 +76,36 @@ protected long getCountStarResult() { } @Override - public Schema createSchema() { - return new Schema.SchemaBuilder().setSchemaName(getTableName()) - .addSingleValueDimension("myMap", FieldSpec.DataType.STRING) - .addSingleValueDimension(MY_MAP_STR_FIELD_NAME, FieldSpec.DataType.STRING) - .addSingleValueDimension(MY_MAP_STR_K1_FIELD_NAME, FieldSpec.DataType.STRING) - .addSingleValueDimension(MY_MAP_STR_K2_FIELD_NAME, FieldSpec.DataType.STRING) - .addSingleValueDimension(COMPLEX_MAP_STR_FIELD_NAME, FieldSpec.DataType.STRING) - .addMultiValueDimension(COMPLEX_MAP_STR_K3_FIELD_NAME, FieldSpec.DataType.STRING).build(); + public String getTableName() { + return 
DEFAULT_TABLE_NAME; } @Override - public String getTableName() { - return DEFAULT_TABLE_NAME; + public Schema createSchema() { + return new Schema.SchemaBuilder() + .setSchemaName(getTableName()) + .addSingleValueDimension("myMap", DataType.STRING) + .addSingleValueDimension(MY_MAP_STR_FIELD_NAME, DataType.STRING) + .addSingleValueDimension(MY_MAP_STR_K1_FIELD_NAME, DataType.STRING) + .addSingleValueDimension(MY_MAP_STR_K2_FIELD_NAME, DataType.STRING) + .addSingleValueDimension(COMPLEX_MAP_STR_FIELD_NAME, DataType.STRING) + .addMultiValueDimension(COMPLEX_MAP_STR_K3_FIELD_NAME, DataType.STRING) + .build(); } @Override public TableConfig createOfflineTableConfig() { - List transformConfigs = Arrays.asList( + List transformConfigs = List.of( new TransformConfig(MY_MAP_STR_K1_FIELD_NAME, "jsonPathString(" + MY_MAP_STR_FIELD_NAME + ", '$.k1')"), new TransformConfig(MY_MAP_STR_K2_FIELD_NAME, "jsonPathString(" + MY_MAP_STR_FIELD_NAME + ", '$.k2')"), - new TransformConfig(COMPLEX_MAP_STR_K3_FIELD_NAME, - "jsonPathArray(" + COMPLEX_MAP_STR_FIELD_NAME + ", '$.k3')")); + new TransformConfig(COMPLEX_MAP_STR_K3_FIELD_NAME, "jsonPathArray(" + COMPLEX_MAP_STR_FIELD_NAME + ", '$.k3')") + ); IngestionConfig ingestionConfig = new IngestionConfig(); ingestionConfig.setTransformConfigs(transformConfigs); - return new TableConfigBuilder(TableType.OFFLINE).setTableName(getTableName()).setIngestionConfig(ingestionConfig) - .setJsonIndexColumns(Collections.singletonList(MY_MAP_STR_FIELD_NAME)) + return new TableConfigBuilder(TableType.OFFLINE) + .setTableName(getTableName()) + .setIngestionConfig(ingestionConfig) + .setJsonIndexColumns(List.of(MY_MAP_STR_FIELD_NAME)) .build(); } @@ -100,17 +113,18 @@ public TableConfig createOfflineTableConfig() { public List createAvroFiles() throws Exception { org.apache.avro.Schema avroSchema = org.apache.avro.Schema.createRecord("myRecord", null, null, false); - List fields = - Arrays.asList(new org.apache.avro.Schema.Field(MY_MAP_STR_FIELD_NAME, 
org.apache.avro.Schema.create( - org.apache.avro.Schema.Type.STRING), null, null), - new org.apache.avro.Schema.Field(COMPLEX_MAP_STR_FIELD_NAME, org.apache.avro.Schema.create( - org.apache.avro.Schema.Type.STRING), null, null)); + List fields = List.of( + new org.apache.avro.Schema.Field(MY_MAP_STR_FIELD_NAME, + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), null, null), + new org.apache.avro.Schema.Field(COMPLEX_MAP_STR_FIELD_NAME, + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), null, null) + ); avroSchema.setFields(fields); try (AvroFilesAndWriters avroFilesAndWriters = createAvroFilesAndWriters(avroSchema)) { for (int i = 0; i < NUM_DOCS_PER_SEGMENT; i++) { Map map = new HashMap<>(); - map.put("k1", "value-k1-" + i); + map.put("k1", "value-k1-" + (i % NUM_DISTINCT_K1)); map.put("k2", "value-k2-" + i); GenericData.Record record = new GenericData.Record(avroSchema); record.put(MY_MAP_STR_FIELD_NAME, JsonUtils.objectToString(map)); @@ -118,17 +132,20 @@ public List createAvroFiles() Map complexMap = new HashMap<>(); complexMap.put("k1", "value-k1-" + i); complexMap.put("k2", "value-k2-" + i); - complexMap.put("k3", Arrays.asList("value-k3-0-" + i, "value-k3-1-" + i, "value-k3-2-" + i)); - complexMap.put("k4", - Map.of("k4-k1", "value-k4-k1-" + i, "k4-k2", "value-k4-k2-" + i, "k4-k3", "value-k4-k3-" + i, - "met", i)); + complexMap.put("k3", List.of("value-k3-0-" + i, "value-k3-1-" + i, "value-k3-2-" + i)); + complexMap.put("k4", Map.of( + "k4-k1", "value-k4-k1-" + i, + "k4-k2", "value-k4-k2-" + i, + "k4-k3", "value-k4-k3-" + i, + "met", i) + ); record.put(COMPLEX_MAP_STR_FIELD_NAME, JsonUtils.objectToString(complexMap)); for (DataFileWriter writer : avroFilesAndWriters.getWriters()) { writer.append(record); } _sortedSequenceIds.add(String.valueOf(i)); } - Collections.sort(_sortedSequenceIds); + _sortedSequenceIds.sort(null); return avroFilesAndWriters.getAvroFiles(); } } @@ -139,49 +156,47 @@ public void testQueries(boolean 
useMultiStageQueryEngine) setUseMultiStageQueryEngine(useMultiStageQueryEngine); //Selection Query - String query = "Select myMapStr from " + getTableName(); + String query = "SELECT myMapStr FROM " + getTableName(); JsonNode pinotResponse = postQuery(query); ArrayNode rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertNotNull(rows); - Assert.assertFalse(rows.isEmpty()); + assertNotNull(rows); + assertFalse(rows.isEmpty()); for (int i = 0; i < rows.size(); i++) { String value = rows.get(i).get(0).textValue(); - Assert.assertTrue(value.indexOf("-k1-") > 0); + assertTrue(value.indexOf("-k1-") > 0); } //Filter Query - query = "Select jsonExtractScalar(myMapStr,'$.k1','STRING') from " + getTableName() - + " where jsonExtractScalar(myMapStr,'$.k1','STRING') = 'value-k1-0'"; + String expr = "jsonExtractScalar(myMapStr,'$.k1','STRING')"; + query = "SELECT " + expr + " FROM " + getTableName() + " WHERE " + expr + " = 'value-k1-0'"; pinotResponse = postQuery(query); rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertNotNull(rows); - Assert.assertFalse(rows.isEmpty()); + assertNotNull(rows); + assertFalse(rows.isEmpty()); for (int i = 0; i < rows.size(); i++) { String value = rows.get(i).get(0).textValue(); - Assert.assertEquals(value, "value-k1-0"); + assertEquals(value, "value-k1-0"); } //selection order by - query = "Select jsonExtractScalar(myMapStr,'$.k1','STRING') from " + getTableName() - + " order by jsonExtractScalar(myMapStr,'$.k1','STRING')"; + query = "SELECT " + expr + " FROM " + getTableName() + " ORDER BY " + expr; pinotResponse = postQuery(query); rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertNotNull(rows); - Assert.assertFalse(rows.isEmpty()); + assertNotNull(rows); + assertFalse(rows.isEmpty()); for (int i = 0; i < rows.size(); i++) { String value = rows.get(i).get(0).textValue(); - Assert.assertTrue(value.indexOf("-k1-") > 0); + assertTrue(value.indexOf("-k1-") > 
0); } //Group By Query - query = "Select jsonExtractScalar(myMapStr,'$.k1','STRING'), count(*) from " + getTableName() - + " group by jsonExtractScalar(myMapStr,'$.k1','STRING')"; + query = "SELECT " + expr + ", count(*) FROM " + getTableName() + " GROUP BY " + expr; pinotResponse = postQuery(query); - Assert.assertNotNull(pinotResponse.get("resultTable")); + assertNotNull(pinotResponse.get("resultTable")); rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); for (int i = 0; i < rows.size(); i++) { String value = rows.get(i).get(0).textValue(); - Assert.assertTrue(value.indexOf("-k1-") > 0); + assertTrue(value.indexOf("-k1-") > 0); } } @@ -190,71 +205,71 @@ public void testComplexQueries(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); //Selection Query - String query = "Select complexMapStr from " + getTableName(); + String query = "SELECT complexMapStr FROM " + getTableName(); JsonNode pinotResponse = postQuery(query); ArrayNode rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertNotNull(rows); - Assert.assertFalse(rows.isEmpty()); + assertNotNull(rows); + assertFalse(rows.isEmpty()); for (int i = 0; i < rows.size(); i++) { String value = rows.get(i).get(0).textValue(); Map results = JsonUtils.stringToObject(value, Map.class); - Assert.assertTrue(value.indexOf("-k1-") > 0); - Assert.assertEquals(results.get("k1"), "value-k1-" + i % NUM_DOCS_PER_SEGMENT); - Assert.assertEquals(results.get("k2"), "value-k2-" + i % NUM_DOCS_PER_SEGMENT); - final List k3 = (List) results.get("k3"); - Assert.assertEquals(k3.size(), 3); - Assert.assertEquals(k3.get(0), "value-k3-0-" + i % NUM_DOCS_PER_SEGMENT); - Assert.assertEquals(k3.get(1), "value-k3-1-" + i % NUM_DOCS_PER_SEGMENT); - Assert.assertEquals(k3.get(2), "value-k3-2-" + i % NUM_DOCS_PER_SEGMENT); - final Map k4 = (Map) results.get("k4"); - Assert.assertEquals(k4.size(), 4); - Assert.assertEquals(k4.get("k4-k1"), 
"value-k4-k1-" + i % NUM_DOCS_PER_SEGMENT); - Assert.assertEquals(k4.get("k4-k2"), "value-k4-k2-" + i % NUM_DOCS_PER_SEGMENT); - Assert.assertEquals(k4.get("k4-k3"), "value-k4-k3-" + i % NUM_DOCS_PER_SEGMENT); - Assert.assertEquals(Double.parseDouble(k4.get("met").toString()), i % NUM_DOCS_PER_SEGMENT); + assertTrue(value.indexOf("-k1-") > 0); + assertEquals(results.get("k1"), "value-k1-" + i % NUM_DOCS_PER_SEGMENT); + assertEquals(results.get("k2"), "value-k2-" + i % NUM_DOCS_PER_SEGMENT); + List k3 = (List) results.get("k3"); + assertEquals(k3.size(), 3); + assertEquals(k3.get(0), "value-k3-0-" + i % NUM_DOCS_PER_SEGMENT); + assertEquals(k3.get(1), "value-k3-1-" + i % NUM_DOCS_PER_SEGMENT); + assertEquals(k3.get(2), "value-k3-2-" + i % NUM_DOCS_PER_SEGMENT); + Map k4 = (Map) results.get("k4"); + assertEquals(k4.size(), 4); + assertEquals(k4.get("k4-k1"), "value-k4-k1-" + i % NUM_DOCS_PER_SEGMENT); + assertEquals(k4.get("k4-k2"), "value-k4-k2-" + i % NUM_DOCS_PER_SEGMENT); + assertEquals(k4.get("k4-k3"), "value-k4-k3-" + i % NUM_DOCS_PER_SEGMENT); + assertEquals(Double.parseDouble(k4.get("met").toString()), i % NUM_DOCS_PER_SEGMENT); } //Filter Query - query = "Select jsonExtractScalar(complexMapStr,'$.k4','STRING') from " + getTableName() - + " where jsonExtractScalar(complexMapStr,'$.k4.k4-k1','STRING') = 'value-k4-k1-0'"; + query = "SELECT jsonExtractScalar(complexMapStr,'$.k4','STRING') FROM " + getTableName() + + " WHERE jsonExtractScalar(complexMapStr,'$.k4.k4-k1','STRING') = 'value-k4-k1-0'"; pinotResponse = postQuery(query); rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertNotNull(rows); - Assert.assertEquals(rows.size(), getNumAvroFiles()); + assertNotNull(rows); + assertEquals(rows.size(), getNumAvroFiles()); for (int i = 0; i < rows.size(); i++) { String value = rows.get(i).get(0).textValue(); Map k4 = JsonUtils.stringToObject(value, Map.class); - Assert.assertEquals(k4.size(), 4); - Assert.assertEquals(k4.get("k4-k1"), 
"value-k4-k1-0"); - Assert.assertEquals(k4.get("k4-k2"), "value-k4-k2-0"); - Assert.assertEquals(k4.get("k4-k3"), "value-k4-k3-0"); - Assert.assertEquals(Double.parseDouble(k4.get("met").toString()), 0.0); + assertEquals(k4.size(), 4); + assertEquals(k4.get("k4-k1"), "value-k4-k1-0"); + assertEquals(k4.get("k4-k2"), "value-k4-k2-0"); + assertEquals(k4.get("k4-k3"), "value-k4-k3-0"); + assertEquals(Double.parseDouble(k4.get("met").toString()), 0.0); } //selection order by - query = "Select complexMapStr from " + getTableName() - + " order by jsonExtractScalar(complexMapStr,'$.k4.k4-k1','STRING') DESC LIMIT " + NUM_DOCS_PER_SEGMENT; + query = "SELECT complexMapStr FROM " + getTableName() + + " ORDER BY jsonExtractScalar(complexMapStr,'$.k4.k4-k1','STRING') DESC LIMIT " + NUM_DOCS_PER_SEGMENT; pinotResponse = postQuery(query); rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertNotNull(rows); - Assert.assertFalse(rows.isEmpty()); + assertNotNull(rows); + assertFalse(rows.isEmpty()); for (int i = 0; i < rows.size(); i++) { String value = rows.get(i).get(0).textValue(); - Assert.assertTrue(value.indexOf("-k1-") > 0); + assertTrue(value.indexOf("-k1-") > 0); Map results = JsonUtils.stringToObject(value, Map.class); String seqId = _sortedSequenceIds.get(NUM_DOCS_PER_SEGMENT - 1 - i / getNumAvroFiles()); - Assert.assertEquals(results.get("k1"), "value-k1-" + seqId); - Assert.assertEquals(results.get("k2"), "value-k2-" + seqId); - final List k3 = (List) results.get("k3"); - Assert.assertEquals(k3.get(0), "value-k3-0-" + seqId); - Assert.assertEquals(k3.get(1), "value-k3-1-" + seqId); - Assert.assertEquals(k3.get(2), "value-k3-2-" + seqId); - final Map k4 = (Map) results.get("k4"); - Assert.assertEquals(k4.get("k4-k1"), "value-k4-k1-" + seqId); - Assert.assertEquals(k4.get("k4-k2"), "value-k4-k2-" + seqId); - Assert.assertEquals(k4.get("k4-k3"), "value-k4-k3-" + seqId); - Assert.assertEquals(Double.parseDouble(k4.get("met").toString()), 
Double.parseDouble(seqId)); + assertEquals(results.get("k1"), "value-k1-" + seqId); + assertEquals(results.get("k2"), "value-k2-" + seqId); + List k3 = (List) results.get("k3"); + assertEquals(k3.get(0), "value-k3-0-" + seqId); + assertEquals(k3.get(1), "value-k3-1-" + seqId); + assertEquals(k3.get(2), "value-k3-2-" + seqId); + Map k4 = (Map) results.get("k4"); + assertEquals(k4.get("k4-k1"), "value-k4-k1-" + seqId); + assertEquals(k4.get("k4-k2"), "value-k4-k2-" + seqId); + assertEquals(k4.get("k4-k3"), "value-k4-k3-" + seqId); + assertEquals(Double.parseDouble(k4.get("met").toString()), Double.parseDouble(seqId)); } } @@ -264,18 +279,18 @@ public void testComplexGroupByQueryV1(boolean useMultiStageQueryEngine) setUseMultiStageQueryEngine(useMultiStageQueryEngine); //Group By Query - String query = "Select" + " jsonExtractScalar(complexMapStr,'$.k1','STRING')," - + " sum(jsonExtractScalar(complexMapStr,'$.k4.met','INT'))" + " from " + getTableName() - + " group by jsonExtractScalar(complexMapStr,'$.k1','STRING')" - + " order by sum(jsonExtractScalar(complexMapStr,'$.k4.met','INT')) DESC"; + String groupExpr = "jsonExtractScalar(complexMapStr,'$.k1','STRING')"; + String sumExpr = "SUM(jsonExtractScalar(complexMapStr,'$.k4.met','INT'))"; + String query = "SELECT " + groupExpr + ", " + sumExpr + " FROM " + getTableName() + + " GROUP BY " + groupExpr + " ORDER BY " + sumExpr + " DESC"; JsonNode pinotResponse = postQuery(query); - Assert.assertNotNull(pinotResponse.get("resultTable").get("rows")); + assertNotNull(pinotResponse.get("resultTable").get("rows")); ArrayNode rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); for (int i = 0; i < rows.size(); i++) { String seqId = _sortedSequenceIds.get(NUM_DOCS_PER_SEGMENT - 1 - i); - final JsonNode row = rows.get(i); - Assert.assertEquals(row.get(0).asText(), "value-k1-" + seqId); - Assert.assertEquals(row.get(1).asDouble(), Double.parseDouble(seqId) * getNumAvroFiles()); + JsonNode row = rows.get(i); + 
assertEquals(row.get(0).asText(), "value-k1-" + seqId); + assertEquals(row.get(1).asDouble(), Double.parseDouble(seqId) * getNumAvroFiles()); } } @@ -284,18 +299,18 @@ public void testComplexGroupByQueryV2(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); //Group By Query - String query = "Select" + " jsonExtractScalar(complexMapStr,'$.k1','STRING')," - + " sum(jsonExtractScalar(complexMapStr,'$.k4.met','INT'))" + " from " + getTableName() - + " group by jsonExtractScalar(complexMapStr,'$.k1','STRING')" - + " order by sum(jsonExtractScalar(complexMapStr,'$.k4.met','INT')) DESC"; + String groupExpr = "jsonExtractScalar(complexMapStr,'$.k1','STRING')"; + String sumExpr = "SUM(jsonExtractScalar(complexMapStr,'$.k4.met','INT'))"; + String query = "SELECT " + groupExpr + ", " + sumExpr + " FROM " + getTableName() + + " GROUP BY " + groupExpr + " ORDER BY " + sumExpr + " DESC"; JsonNode pinotResponse = postQuery(query); - Assert.assertNotNull(pinotResponse.get("resultTable").get("rows")); + assertNotNull(pinotResponse.get("resultTable").get("rows")); ArrayNode rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); for (int i = 0; i < rows.size(); i++) { String seqId = String.valueOf(NUM_DOCS_PER_SEGMENT - 1 - i); - final JsonNode row = rows.get(i); - Assert.assertEquals(row.get(0).asText(), "value-k1-" + seqId); - Assert.assertEquals(row.get(1).asDouble(), Double.parseDouble(seqId) * getNumAvroFiles()); + JsonNode row = rows.get(i); + assertEquals(row.get(0).asText(), "value-k1-" + seqId); + assertEquals(row.get(1).asDouble(), Double.parseDouble(seqId) * getNumAvroFiles()); } } @@ -304,17 +319,17 @@ public void testQueryWithIntegerDefault(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); //Group By Query - String query = "Select" + " jsonExtractScalar(complexMapStr,'$.inExistKey','STRING','defaultKey')," - + " 
sum(jsonExtractScalar(complexMapStr,'$.inExistMet','INT','1'))" + " from " + getTableName() - + " group by jsonExtractScalar(complexMapStr,'$.inExistKey','STRING','defaultKey')" - + " order by sum(jsonExtractScalar(complexMapStr,'$.inExistMet','INT','1')) DESC"; + String groupExpr = "jsonExtractScalar(complexMapStr,'$.inExistKey','STRING','defaultKey')"; + String sumExpr = "SUM(jsonExtractScalar(complexMapStr,'$.inExistMet','INT','1'))"; + String query = "SELECT " + groupExpr + ", " + sumExpr + " FROM " + getTableName() + + " GROUP BY " + groupExpr + " ORDER BY " + sumExpr + " DESC"; JsonNode pinotResponse = postQuery(query); - Assert.assertNotNull(pinotResponse.get("resultTable").get("rows")); + assertNotNull(pinotResponse.get("resultTable").get("rows")); ArrayNode rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertEquals(rows.size(), 1); - final JsonNode row = rows.get(0); - Assert.assertEquals(row.get(0).asText(), "defaultKey"); - Assert.assertEquals(row.get(1).asDouble(), 1000.0 * getNumAvroFiles()); + assertEquals(rows.size(), 1); + JsonNode row = rows.get(0); + assertEquals(row.get(0).asText(), "defaultKey"); + assertEquals(row.get(1).asDouble(), 1000.0 * getNumAvroFiles()); } @Test(dataProvider = "useBothQueryEngines") @@ -322,24 +337,24 @@ public void testQueryWithDoubleDefault(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); //Group By Query - String query = "Select" + " jsonExtractScalar(complexMapStr,'$.inExistKey','STRING', 'defaultKey')," - + " sum(jsonExtractScalar(complexMapStr,'$.inExistMet','DOUBLE','0.1'))" + " from " + getTableName() - + " group by jsonExtractScalar(complexMapStr,'$.inExistKey','STRING','defaultKey')" - + " order by sum(jsonExtractScalar(complexMapStr,'$.inExistMet','DOUBLE','0.1')) DESC"; + String groupExpr = "jsonExtractScalar(complexMapStr,'$.inExistKey','STRING','defaultKey')"; + String sumExpr = 
"SUM(jsonExtractScalar(complexMapStr,'$.inExistMet','DOUBLE','0.1'))"; + String query = "SELECT " + groupExpr + ", " + sumExpr + " FROM " + getTableName() + + " GROUP BY " + groupExpr + " ORDER BY " + sumExpr + " DESC"; JsonNode pinotResponse = postQuery(query); - Assert.assertNotNull(pinotResponse.get("resultTable").get("rows")); + assertNotNull(pinotResponse.get("resultTable").get("rows")); ArrayNode rows = (ArrayNode) pinotResponse.get("resultTable").get("rows"); - Assert.assertEquals(rows.size(), 1); - final JsonNode row = rows.get(0); - Assert.assertEquals(row.get(0).asText(), "defaultKey"); - Assert.assertTrue(Math.abs(row.get(1).asDouble() - 100.0 * getNumAvroFiles()) < 1e-10); + assertEquals(rows.size(), 1); + JsonNode row = rows.get(0); + assertEquals(row.get(0).asText(), "defaultKey"); + assertTrue(Math.abs(row.get(1).asDouble() - 100.0 * getNumAvroFiles()) < 1e-10); } @Test(dataProvider = "useBothQueryEngines") void testFailedQuery(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "Select jsonExtractScalar(myMapStr,\"$.k1\",\"STRING\") from " + getTableName(); + String query = "SELECT jsonExtractScalar(myMapStr,\"$.k1\",\"STRING\") FROM " + getTableName(); JsonNode pinotResponse = postQuery(query); int expectedStatusCode; if (useMultiStageQueryEngine) { @@ -347,30 +362,30 @@ void testFailedQuery(boolean useMultiStageQueryEngine) } else { expectedStatusCode = QueryErrorCode.SQL_PARSING.getId(); } - Assert.assertEquals(pinotResponse.get("exceptions").get(0).get("errorCode").asInt(), expectedStatusCode); - Assert.assertEquals(pinotResponse.get("numDocsScanned").asInt(), 0); - Assert.assertEquals(pinotResponse.get("totalDocs").asInt(), 0); + assertEquals(pinotResponse.get("exceptions").get(0).get("errorCode").asInt(), expectedStatusCode); + assertEquals(pinotResponse.get("numDocsScanned").asInt(), 0); + assertEquals(pinotResponse.get("totalDocs").asInt(), 0); - query = "Select 
myMapStr from " + getTableName() - + " where jsonExtractScalar(myMapStr, '$.k1',\"STRING\") = 'value-k1-0'"; + query = "SELECT myMapStr FROM " + getTableName() + + " WHERE jsonExtractScalar(myMapStr, '$.k1',\"STRING\") = 'value-k1-0'"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").get(0).get("errorCode").asInt(), expectedStatusCode); - Assert.assertEquals(pinotResponse.get("numDocsScanned").asInt(), 0); - Assert.assertEquals(pinotResponse.get("totalDocs").asInt(), 0); + assertEquals(pinotResponse.get("exceptions").get(0).get("errorCode").asInt(), expectedStatusCode); + assertEquals(pinotResponse.get("numDocsScanned").asInt(), 0); + assertEquals(pinotResponse.get("totalDocs").asInt(), 0); - query = "Select jsonExtractScalar(myMapStr,\"$.k1\", 'STRING') from " + getTableName() - + " where jsonExtractScalar(myMapStr, '$.k1', 'STRING') = 'value-k1-0'"; + query = "SELECT jsonExtractScalar(myMapStr,\"$.k1\", 'STRING') FROM " + getTableName() + + " WHERE jsonExtractScalar(myMapStr, '$.k1', 'STRING') = 'value-k1-0'"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").get(0).get("errorCode").asInt(), expectedStatusCode); - Assert.assertEquals(pinotResponse.get("numDocsScanned").asInt(), 0); - Assert.assertEquals(pinotResponse.get("totalDocs").asInt(), 0); + assertEquals(pinotResponse.get("exceptions").get(0).get("errorCode").asInt(), expectedStatusCode); + assertEquals(pinotResponse.get("numDocsScanned").asInt(), 0); + assertEquals(pinotResponse.get("totalDocs").asInt(), 0); } @Test public void testJsonPathCache() { Cache cache = CacheProvider.getCache(); - Assert.assertTrue(cache instanceof JsonPathCache); - Assert.assertTrue(((JsonPathCache) cache).size() > 0); + assertTrue(cache instanceof JsonPathCache); + assertTrue(((JsonPathCache) cache).size() > 0); } @Test(dataProvider = "useBothQueryEngines") @@ -379,37 +394,37 @@ public void testJsonKeysQueries(boolean useMultiStageQueryEngine) 
setUseMultiStageQueryEngine(useMultiStageQueryEngine); String query = "SELECT jsonExtractKey(myMapStr, '$.*', 'maxDepth=1') FROM " + getTableName() + " LIMIT 1"; JsonNode pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); JsonNode rows = pinotResponse.get("resultTable").get("rows"); - Assert.assertEquals(rows.size(), 1); + assertEquals(rows.size(), 1); JsonNode row = rows.get(0); - Assert.assertEquals(row.size(), 1); + assertEquals(row.size(), 1); // JsonPath returns keys in JsonPath format like "$['key']" JsonNode keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertTrue(keys.size() > 0); + assertTrue(keys.isArray()); + assertFalse(keys.isEmpty()); query = "SELECT jsonExtractKey(complexMapStr, '$.*', 'maxDepth=2') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); - Assert.assertEquals(rows.size(), 1); + assertEquals(rows.size(), 1); row = rows.get(0); - Assert.assertEquals(row.size(), 1); + assertEquals(row.size(), 1); keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertTrue(keys.size() > 0); + assertTrue(keys.isArray()); + assertFalse(keys.isEmpty()); query = "SELECT jsonExtractKey(complexMapStr, '$.*', 'maxDepth=3') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); - Assert.assertEquals(rows.size(), 1); + assertEquals(rows.size(), 1); row = rows.get(0); - Assert.assertEquals(row.size(), 1); + assertEquals(row.size(), 1); keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertTrue(keys.size() > 0); + 
assertTrue(keys.isArray()); + assertFalse(keys.isEmpty()); } @Test(dataProvider = "useBothQueryEngines") @@ -421,180 +436,180 @@ public void testJsonKeysQueriesWithDotNotation(boolean useMultiStageQueryEngine) String query = "SELECT jsonExtractKey(myMapStr, '$.*', 'maxDepth=1; dotNotation=true') FROM " + getTableName() + " LIMIT 1"; JsonNode pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); JsonNode rows = pinotResponse.get("resultTable").get("rows"); - Assert.assertEquals(rows.size(), 1); + assertEquals(rows.size(), 1); JsonNode row = rows.get(0); - Assert.assertEquals(row.size(), 1); + assertEquals(row.size(), 1); JsonNode keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertEquals(keys.size(), 2); // k1, k2 + assertTrue(keys.isArray()); + assertEquals(keys.size(), 2); // k1, k2 // Should contain simple key names, not JsonPath format List keyList = new ArrayList<>(); for (JsonNode key : keys) { keyList.add(key.asText()); } - Assert.assertTrue(keyList.contains("k1")); - Assert.assertTrue(keyList.contains("k2")); + assertTrue(keyList.contains("k1")); + assertTrue(keyList.contains("k2")); // Should NOT contain JsonPath format like "$['k1']" - Assert.assertFalse(keyList.contains("$['k1']")); - Assert.assertFalse(keyList.contains("$['k2']")); + assertFalse(keyList.contains("$['k1']")); + assertFalse(keyList.contains("$['k2']")); // Test optional parameter jsonExtractKey with dotNotation=false (JsonPath format) query = "SELECT jsonExtractKey(myMapStr, '$.*', 'maxDepth=1; dotNotation=false') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertEquals(keys.size(), 
2); + assertTrue(keys.isArray()); + assertEquals(keys.size(), 2); keyList.clear(); for (JsonNode key : keys) { keyList.add(key.asText()); } // Should contain JsonPath format - Assert.assertTrue(keyList.contains("$['k1']")); - Assert.assertTrue(keyList.contains("$['k2']")); + assertTrue(keyList.contains("$['k1']")); + assertTrue(keyList.contains("$['k2']")); // Test recursive key extraction with dot notation on complex JSON query = "SELECT jsonExtractKey(complexMapStr, '$..**', 'maxDepth=2; dotNotation=true') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertTrue(keys.size() >= 4); // At least k1, k2, k3, k4 + assertTrue(keys.isArray()); + assertTrue(keys.size() >= 4); // At least k1, k2, k3, k4 keyList.clear(); for (JsonNode key : keys) { keyList.add(key.asText()); } // Should contain top-level keys in dot notation - Assert.assertTrue(keyList.contains("k1")); - Assert.assertTrue(keyList.contains("k2")); - Assert.assertTrue(keyList.contains("k3")); - Assert.assertTrue(keyList.contains("k4")); + assertTrue(keyList.contains("k1")); + assertTrue(keyList.contains("k2")); + assertTrue(keyList.contains("k3")); + assertTrue(keyList.contains("k4")); // Test recursive key extraction with JsonPath format query = "SELECT jsonExtractKey(complexMapStr, '$..**', 'maxDepth=2; dotNotation=false') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); + assertTrue(keys.isArray()); keyList.clear(); for (JsonNode key : keys) { 
keyList.add(key.asText()); } // Should contain JsonPath format - Assert.assertTrue(keyList.contains("$['k1']")); - Assert.assertTrue(keyList.contains("$['k2']")); - Assert.assertTrue(keyList.contains("$['k3']")); - Assert.assertTrue(keyList.contains("$['k4']")); + assertTrue(keyList.contains("$['k1']")); + assertTrue(keyList.contains("$['k2']")); + assertTrue(keyList.contains("$['k3']")); + assertTrue(keyList.contains("$['k4']")); // Test deeper recursive extraction with dot notation query = "SELECT jsonExtractKey(complexMapStr, '$..**', 'maxDepth=3; dotNotation=true') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertTrue(keys.size() > 4); // Should include nested keys + assertTrue(keys.isArray()); + assertTrue(keys.size() > 4); // Should include nested keys keyList.clear(); for (JsonNode key : keys) { keyList.add(key.asText()); } // Should contain nested keys in dot notation - Assert.assertTrue(keyList.contains("k4.k4-k1")); - Assert.assertTrue(keyList.contains("k4.k4-k2")); - Assert.assertTrue(keyList.contains("k4.k4-k3")); - Assert.assertTrue(keyList.contains("k4.met")); + assertTrue(keyList.contains("k4.k4-k1")); + assertTrue(keyList.contains("k4.k4-k2")); + assertTrue(keyList.contains("k4.k4-k3")); + assertTrue(keyList.contains("k4.met")); // Should contain array indices in dot notation - Assert.assertTrue(keyList.contains("k3.0")); - Assert.assertTrue(keyList.contains("k3.1")); - Assert.assertTrue(keyList.contains("k3.2")); + assertTrue(keyList.contains("k3.0")); + assertTrue(keyList.contains("k3.1")); + assertTrue(keyList.contains("k3.2")); // Test deeper recursive extraction with JsonPath format query = "SELECT jsonExtractKey(complexMapStr, '$..**', 'maxDepth=3; 
dotNotation=false') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); + assertTrue(keys.isArray()); keyList.clear(); for (JsonNode key : keys) { keyList.add(key.asText()); } // Should contain nested keys in JsonPath format - Assert.assertTrue(keyList.contains("$['k4']['k4-k1']")); - Assert.assertTrue(keyList.contains("$['k4']['k4-k2']")); - Assert.assertTrue(keyList.contains("$['k4']['k4-k3']")); - Assert.assertTrue(keyList.contains("$['k4']['met']")); + assertTrue(keyList.contains("$['k4']['k4-k1']")); + assertTrue(keyList.contains("$['k4']['k4-k2']")); + assertTrue(keyList.contains("$['k4']['k4-k3']")); + assertTrue(keyList.contains("$['k4']['met']")); // Should contain array indices in JsonPath format - Assert.assertTrue(keyList.contains("$['k3'][0]")); - Assert.assertTrue(keyList.contains("$['k3'][1]")); - Assert.assertTrue(keyList.contains("$['k3'][2]")); + assertTrue(keyList.contains("$['k3'][0]")); + assertTrue(keyList.contains("$['k3'][1]")); + assertTrue(keyList.contains("$['k3'][2]")); // Test specific path extraction with dot notation query = "SELECT jsonExtractKey(complexMapStr, '$.k4.*', 'maxDepth=2; dotNotation=true') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); - Assert.assertEquals(keys.size(), 4); // k4-k1, k4-k2, k4-k3, met + assertTrue(keys.isArray()); + assertEquals(keys.size(), 4); // k4-k1, k4-k2, k4-k3, met keyList.clear(); for (JsonNode key : keys) { keyList.add(key.asText()); } // Should contain 
nested keys in dot notation format - Assert.assertTrue(keyList.contains("k4.k4-k1")); - Assert.assertTrue(keyList.contains("k4.k4-k2")); - Assert.assertTrue(keyList.contains("k4.k4-k3")); - Assert.assertTrue(keyList.contains("k4.met")); + assertTrue(keyList.contains("k4.k4-k1")); + assertTrue(keyList.contains("k4.k4-k2")); + assertTrue(keyList.contains("k4.k4-k3")); + assertTrue(keyList.contains("k4.met")); // Test backward compatibility - 2-parameter version should default to JsonPath format query = "SELECT jsonExtractKey(myMapStr, '$.*') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); + assertTrue(keys.isArray()); keyList.clear(); for (JsonNode key : keys) { keyList.add(key.asText()); } // Should default to JsonPath format - Assert.assertTrue(keyList.contains("$['k1']")); - Assert.assertTrue(keyList.contains("$['k2']")); + assertTrue(keyList.contains("$['k1']")); + assertTrue(keyList.contains("$['k2']")); // Test backward compatibility - no dotNotation should default to JsonPath format query = "SELECT jsonExtractKey(myMapStr, '$.*', 'maxDepth=1') FROM " + getTableName() + " LIMIT 1"; pinotResponse = postQuery(query); - Assert.assertEquals(pinotResponse.get("exceptions").size(), 0); + assertEquals(pinotResponse.get("exceptions").size(), 0); rows = pinotResponse.get("resultTable").get("rows"); row = rows.get(0); keys = row.get(0); - Assert.assertTrue(keys.isArray()); + assertTrue(keys.isArray()); keyList.clear(); for (JsonNode key : keys) { keyList.add(key.asText()); } // Should default to JsonPath format - Assert.assertTrue(keyList.contains("$['k1']")); - Assert.assertTrue(keyList.contains("$['k2']")); + assertTrue(keyList.contains("$['k1']")); + assertTrue(keyList.contains("$['k2']")); } // 
--- JsonIndexDistinctOperator tests (useIndexBasedDistinctOperator) --- @@ -608,79 +623,77 @@ public void testJsonIndexDistinctOperatorDisabledByDefault(boolean useMultiStage throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode response = postQuery(query); - Assert.assertEquals(response.get("exceptions").size(), 0); + assertEquals(response.get("exceptions").size(), 0); List values = extractOrderedDistinctValues(response); - Assert.assertFalse(values.isEmpty(), - "Baseline (operator disabled) should return distinct values. Engine=" - + (useMultiStageQueryEngine ? "MSE" : "SSE")); + assertFalse(values.isEmpty(), + "Baseline (operator disabled) should return distinct values. Engine=" + (useMultiStageQueryEngine ? "MSE" + : "SSE")); } /** * With useIndexBasedDistinctOperator, JsonIndexDistinctOperator produces same results as baseline. - * Compares ordered rows (not just sets) to verify ORDER BY semantics. - * For SSE, verifies numEntriesScannedPostFilter=0 (index path, no doc scan). + * Compares ordered rows (not just sets) to verify ORDER BY semantics. The numEntriesScannedPostFilter assertion + * pins the operator's per-value iteration: one increment per entry in the value-to-docs map, so the expected + * count is NUM_DISTINCT_K1 * getNumAvroFiles(). 
*/ @Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctOperatorWithPinotJsonIndex(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "JsonIndexDistinctOperator should produce same ordered results as baseline. " - + "Engine=" + (useMultiStageQueryEngine ? "MSE" : "SSE")); + assertEquals(optimizedRows, baselineRows, + "JsonIndexDistinctOperator should produce same ordered results as baseline. " + "Engine=" + ( + useMultiStageQueryEngine ? 
"MSE" : "SSE")); - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L, - "JsonIndexDistinctOperator (SSE) uses index only (numEntriesScannedPostFilter=0)."); - } + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), NUM_DISTINCT_K1 * getNumAvroFiles()); } /** - * JsonIndexDistinctOperator with filter produces same ordered results as baseline. - * For SSE, verifies numEntriesScannedPostFilter=0 (index path, no doc scan). + * JsonIndexDistinctOperator with a WHERE filter on a different path produces the same ordered results as the + * baseline. The operator still iterates every entry in the $.k1 value-to-docs map (the WHERE filter is applied + * via per-entry bitmap intersection, not by shrinking the map), so numEntriesScannedPostFilter is + * NUM_DISTINCT_K1 * getNumAvroFiles() — same as the no-filter case. */ @Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctOperatorWithFilter(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k2', 'STRING') = 'value-k2-0'" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String k1Expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING')"; + String k2Expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k2', 'STRING')"; + String query = "SELECT DISTINCT " + k1Expr + " FROM " + getTableName() + " WHERE " + k2Expr + " = 'value-k2-0'" + + " ORDER BY " + k1Expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, 
USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "JsonIndexDistinctOperator with filter should match baseline. Engine=" - + (useMultiStageQueryEngine ? "MSE" : "SSE")); + assertEquals(optimizedRows, baselineRows, + "JsonIndexDistinctOperator with filter should match baseline. Engine=" + (useMultiStageQueryEngine ? "MSE" + : "SSE")); - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L, - "JsonIndexDistinctOperator with filter (SSE) uses index only (numEntriesScannedPostFilter=0)."); - } + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), NUM_DISTINCT_K1 * getNumAvroFiles()); } /** - * Verifies that JsonIndexDistinctOperator correctly materializes the defaultValue for docs where the JSON path + * Verifies that JsonIndexDistinctOperator correctly materializes the defaultValue for docs where the JSON path * is absent, matching baseline JsonExtractIndexTransformFunction behavior.
*/ @Test(dataProvider = "useBothQueryEngines") @@ -689,24 +702,25 @@ public void testJsonIndexDistinctOperatorWithDefaultValue(boolean useMultiStageQ setUseMultiStageQueryEngine(useMultiStageQueryEngine); // Query a non-existent path with a defaultValue — all docs should produce the default - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME - + ", '$.nonexistent', 'STRING', 'N/A') FROM " + getTableName() - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.nonexistent', 'STRING', 'N/A') LIMIT 10"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.nonexistent', 'STRING', 'N/A')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "JsonIndexDistinctOperator with defaultValue should match baseline. Engine=" - + (useMultiStageQueryEngine ? "MSE" : "SSE")); - Assert.assertTrue(optimizedRows.contains("N/A"), - "defaultValue 'N/A' should appear in results for non-existent path. Engine=" - + (useMultiStageQueryEngine ? "MSE" : "SSE")); + assertEquals(optimizedRows, baselineRows, + "JsonIndexDistinctOperator with defaultValue should match baseline. Engine=" + (useMultiStageQueryEngine ? 
"MSE" + : "SSE")); + assertTrue(optimizedRows.contains("N/A"), + "defaultValue 'N/A' should appear in results for non-existent path. Engine=" + (useMultiStageQueryEngine ? "MSE" + : "SSE")); + + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), 0); } /** @@ -720,262 +734,259 @@ public void testJsonIndexDistinctOperatorMissingPathNoDefault(boolean useMultiSt setUseMultiStageQueryEngine(useMultiStageQueryEngine); // Query a non-existent path WITHOUT defaultValue — should produce an error - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME - + ", '$.nonexistent', 'STRING') FROM " + getTableName() + " LIMIT 10"; + String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.nonexistent', 'STRING') FROM " + + getTableName() + " LIMIT 10"; // Baseline also throws for missing path without defaultValue JsonNode baselineResponse = postQuery(query); - Assert.assertTrue(baselineResponse.get("exceptions").size() > 0, + assertFalse(baselineResponse.get("exceptions").isEmpty(), "Baseline should throw for missing JSON path without defaultValue"); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertTrue(optimizedResponse.get("exceptions").size() > 0, + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertFalse(optimizedResponse.get("exceptions").isEmpty(), "JsonIndexDistinctOperator should throw for missing JSON path without defaultValue"); } - // --- Same-path JSON_MATCH predicate tests (trigger getMatchingDistinctValues fast path) --- + // --- 5-arg jsonExtractIndex(column, path, type, default, filterJsonExpression) tests --- + // + // The 5-arg form pushes the JSON_MATCH-style filter into the JSON-index lookup itself. 
Each filter that doesn't + // match every doc causes `handleMissingDocs` (or the transform's per-row default branch) to add the literal + // default to the distinct set, so the expected result is `{matching values} ∪ {default}`. - /** - * Same-path REGEXP_LIKE: fully pushed down, single dict scan, no posting list reads. - */ + /// REGEXP_LIKE pushed down via the 5-arg filterJsonExpression. @Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctSamePathRegexpLike(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - // REGEXP_LIKE on $.k1 matching a subset of values - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME - + ", 'REGEXP_LIKE(\"$.k1\", ''value-k1-[0-9]'')')" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', 'REGEXP_LIKE(\"$.k1\", ''value-k1-[0-9]'')')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertFalse(baselineRows.isEmpty(), "REGEXP_LIKE should match single-digit k1 values"); - Assert.assertEquals(optimizedRows, baselineRows, - 
"Same-path REGEXP_LIKE fast path should match baseline"); + assertEquals(optimizedRows, baselineRows, "5-arg REGEXP_LIKE should match baseline"); + // Single-digit suffix matches value-k1-0..value-k1-9 (10 values); non-matching docs add 'missing'. + assertEquals(optimizedRows.size(), 11); + assertTrue(optimizedRows.contains("missing")); - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); - } + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), 10 * getNumAvroFiles()); } - /** - * Same-path EQ: fully pushed down. - */ + /// EQ pushed down via the 5-arg filterJsonExpression. @Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctSamePathEq(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME - + ", '\"$.k1\" = ''value-k1-0''')" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k1\" = ''value-k1-0''')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = 
extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "Same-path EQ fast path should match baseline"); - Assert.assertTrue(optimizedRows.contains("value-k1-0")); + assertEquals(optimizedRows, baselineRows, "5-arg EQ should match baseline"); + assertEquals(optimizedRows, List.of("missing", "value-k1-0")); - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); - } + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), getNumAvroFiles()); } - /** - * Same-path NOT_EQ: fully pushed down. - */ + /// NOT_EQ pushed down via the 5-arg filterJsonExpression. @Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctSamePathNotEq(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME - + ", '\"$.k1\" != ''value-k1-0''')" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k1\" != ''value-k1-0''')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List 
optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "Same-path NOT_EQ fast path should match baseline"); - Assert.assertFalse(optimizedRows.contains("value-k1-0")); - - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); - } + assertEquals(optimizedRows, baselineRows, "5-arg NOT_EQ should match baseline"); + // 99 matching k1 values (everything except value-k1-0) + 'missing' for the excluded docs. + assertEquals(optimizedRows.size(), NUM_DISTINCT_K1); + assertFalse(optimizedRows.contains("value-k1-0")); + assertTrue(optimizedRows.contains("missing")); + + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), + (NUM_DISTINCT_K1 - 1) * getNumAvroFiles()); } - /** - * Same-path IN: fully pushed down. - */ + /// IN pushed down via the 5-arg filterJsonExpression. @Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctSamePathIn(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME - + ", '\"$.k1\" IN (''value-k1-0'', ''value-k1-1'', ''value-k1-2'')')" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k1\" IN (''value-k1-0'', ''value-k1-1'', ''value-k1-2'')')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, 
USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "Same-path IN fast path should match baseline"); - Assert.assertEquals(optimizedRows.size(), 3); + assertEquals(optimizedRows, baselineRows, "5-arg IN should match baseline"); + assertEquals(optimizedRows, List.of("missing", "value-k1-0", "value-k1-1", "value-k1-2")); - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); - } + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), 3 * getNumAvroFiles()); } - /** - * Same-path NOT_IN: fully pushed down. - */ + /// NOT_IN pushed down via the 5-arg filterJsonExpression. 
@Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctSamePathNotIn(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME - + ", '\"$.k1\" NOT IN (''value-k1-0'', ''value-k1-1'')')" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k1\" NOT IN (''value-k1-0'', ''value-k1-1'')')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "Same-path NOT_IN fast path should match baseline"); - Assert.assertFalse(optimizedRows.contains("value-k1-0")); - Assert.assertFalse(optimizedRows.contains("value-k1-1")); - - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); - } + assertEquals(optimizedRows, baselineRows, "5-arg NOT_IN should match baseline"); + // 98 matching k1 values + 'missing' for the excluded docs. 
+ assertEquals(optimizedRows.size(), NUM_DISTINCT_K1 - 1); + assertFalse(optimizedRows.contains("value-k1-0")); + assertFalse(optimizedRows.contains("value-k1-1")); + assertTrue(optimizedRows.contains("missing")); + + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), + (NUM_DISTINCT_K1 - 2) * getNumAvroFiles()); } - /** - * Same-path IS NOT NULL: fully pushed down. - */ + /// IS NOT NULL pushed down via the 5-arg filterJsonExpression. The filter matches every doc (every row has + /// `$.k1`), so the literal default is never added and the result is exactly the distinct `$.k1` set. @Test(dataProvider = "useBothQueryEngines") public void testJsonIndexDistinctSamePathIsNotNull(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME + ", '\"$.k1\" IS NOT NULL')" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k1\" IS NOT NULL')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - 
Assert.assertEquals(optimizedRows, baselineRows, - "Same-path IS NOT NULL fast path should match baseline"); - Assert.assertEquals(optimizedRows.size(), NUM_DOCS_PER_SEGMENT, - "IS NOT NULL should return all values since every doc has $.k1"); + assertEquals(optimizedRows, baselineRows, "5-arg IS NOT NULL should match baseline"); + assertEquals(optimizedRows.size(), NUM_DISTINCT_K1); + assertFalse(optimizedRows.contains("missing"), + "Filter matches every doc, so the default literal should never be added"); - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); - } + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), NUM_DISTINCT_K1 * getNumAvroFiles()); } - /** - * Same-path REGEXP_LIKE with 4-arg form (defaultValue): fully pushed down fast path still works with defaults. - */ + /// 5-arg filterJsonExpression with LIMIT (no ORDER BY): only the LIMIT row-count is enforced. @Test(dataProvider = "useBothQueryEngines") - public void testJsonIndexDistinctSamePathRegexpLikeWithDefault(boolean useMultiStageQueryEngine) + public void testJsonIndexDistinctSamePathWithLimit(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME - + ", '$.k1', 'STRING', 'fallback') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME - + ", 'REGEXP_LIKE(\"$.k1\", ''value-k1-[0-9]'')')" - + " ORDER BY jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING', 'fallback') LIMIT 10000"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k1\" IS NOT NULL')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " LIMIT 5"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + 
assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); - List baselineRows = extractOrderedDistinctValues(baselineResponse); - List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows, baselineRows, - "Same-path REGEXP_LIKE 4-arg fast path should match baseline"); - // The default should NOT appear since the filter only matches docs that HAVE $.k1 - Assert.assertFalse(optimizedRows.contains("fallback"), - "Same-path filter ensures all matching docs have the path, so no default should appear"); + assertEquals(extractOrderedDistinctValues(baselineResponse).size(), 5); + assertEquals(extractOrderedDistinctValues(optimizedResponse).size(), 5); + // TODO: Fix LIMIT push down for MSE if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), 5 * getNumAvroFiles()); } } - /** - * Same-path REGEXP_LIKE without ORDER BY: verify LIMIT is respected with fast path. - */ + /// Cross-path 5-arg form: filter on `$.k2`, extract `$.k1`. `getMatchingFlattenedDocsMap` applies the filter + /// independently of the extracted path, so the returned value-to-docs map holds the `$.k1` values for only the + /// docs satisfying `$.k2 = 'value-k2-0'`. `$.k2` is unique across the segment, so exactly one doc matches per + /// segment, and that doc's `$.k1` is `value-k1-0`. Every other doc falls through to the literal default. 
@Test(dataProvider = "useBothQueryEngines") - public void testJsonIndexDistinctSamePathWithLimit(boolean useMultiStageQueryEngine) + public void testJsonIndexDistinctCrossPathFilter(boolean useMultiStageQueryEngine) throws Exception { setUseMultiStageQueryEngine(useMultiStageQueryEngine); - String query = "SELECT DISTINCT jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + ", '$.k1', 'STRING') FROM " - + getTableName() + " WHERE JSON_MATCH(" + MY_MAP_STR_FIELD_NAME - + ", '\"$.k1\" IS NOT NULL') LIMIT 5"; + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k2\" = ''value-k2-0''')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; JsonNode baselineResponse = postQuery(query); - Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + assertTrue(baselineResponse.get("exceptions").isEmpty()); - JsonNode optimizedResponse = postQueryWithOptions(query, USE_INDEX_BASED_DISTINCT_OPERATOR + "=true"); - Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + JsonNode optimizedResponse = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(optimizedResponse.get("exceptions").isEmpty()); List baselineRows = extractOrderedDistinctValues(baselineResponse); List optimizedRows = extractOrderedDistinctValues(optimizedResponse); - Assert.assertEquals(optimizedRows.size(), 5, "LIMIT 5 should be respected by fast path"); - Assert.assertEquals(baselineRows.size(), 5); + assertEquals(optimizedRows, baselineRows, "Cross-path 5-arg filter should match baseline"); + assertEquals(optimizedRows, List.of("missing", "value-k1-0")); - if (!useMultiStageQueryEngine) { - Assert.assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asLong(), 0L); - } + assertEquals(optimizedResponse.get("numEntriesScannedPostFilter").asInt(), getNumAvroFiles()); + } + + /// The new `jsonIndexDistinctSkipMissingPath` query option suppresses `handleMissingDocs`, so the 
literal default + /// never appears in the result — even when the 5-arg filter excludes most docs. Same EQ-filter query as + /// `testJsonIndexDistinctSamePathEq`: without the option the operator returns `[missing, value-k1-0]`; with the + /// option it collapses to `[value-k1-0]`. + @Test(dataProvider = "useBothQueryEngines") + public void testJsonIndexDistinctSkipMissingPath(boolean useMultiStageQueryEngine) + throws Exception { + setUseMultiStageQueryEngine(useMultiStageQueryEngine); + + String expr = "jsonExtractIndex(" + MY_MAP_STR_FIELD_NAME + + ", '$.k1', 'STRING', 'missing', '\"$.k1\" = ''value-k1-0''')"; + String query = "SELECT DISTINCT " + expr + " FROM " + getTableName() + " ORDER BY " + expr + " LIMIT 10000"; + + // Operator without skip: same shape as testJsonIndexDistinctSamePathEq — default appears. + JsonNode withoutSkip = postQueryWithOptions(query, OPT_USE_INDEX); + assertTrue(withoutSkip.get("exceptions").isEmpty()); + assertEquals(extractOrderedDistinctValues(withoutSkip), List.of("missing", "value-k1-0")); + + // Operator with skip: default is never added. + JsonNode withSkip = postQueryWithOptions(query, OPT_USE_INDEX_SKIP_MISSING_PATH); + assertTrue(withSkip.get("exceptions").isEmpty()); + assertEquals(extractOrderedDistinctValues(withSkip), List.of("value-k1-0")); + + assertEquals(withSkip.get("numEntriesScannedPostFilter").asInt(), getNumAvroFiles()); } private static List extractOrderedDistinctValues(JsonNode response) { diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java index 840dd97db515..a61a4ba93b3c 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java @@ -708,6 +708,11 @@ public static class QueryOptionKey { * 30 for dictCard <= 1K, 10 for dictCard <= 10K, 6 for dictCard > 10K. 
*/ public static final String INVERTED_INDEX_DISTINCT_COST_RATIO = "invertedIndexDistinctCostRatio"; + /// When true, `JsonIndexDistinctOperator` skips missing-path handling — it does not add a 4-arg default + /// value, does not add null (even when `nullHandling` is enabled), and does not throw `Illegal Json Path`. + /// The result is purely the distinct values produced by the JSON-index lookup (filtered by the optional + /// 5-arg `jsonFilterExpression` and intersected with the `WHERE`-clause filter). + public static final String JSON_INDEX_DISTINCT_SKIP_MISSING_PATH = "jsonIndexDistinctSkipMissingPath"; public static final String SCAN_STAR_TREE_NODES = "scanStarTreeNodes"; public static final String ROUTING_OPTIONS = "routingOptions"; public static final String TABLE_SAMPLER = "sampler";