From ca749208e4d2b6581f727cbfdbd14ee538be2328 Mon Sep 17 00:00:00 2001 From: Xiang Fu Date: Mon, 11 May 2026 02:20:17 -0700 Subject: [PATCH] =?UTF-8?q?[feature]=20SSE=20Materialized=20View=20?= =?UTF-8?q?=E2=80=94=20view=20creation,=20ingestion=20(PR=201=20of=202)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR 1 of a 2-PR split. This change introduces the view-creation and materialization pipeline; broker-side query rewrite ships in PR 2. **Scope of PR 1:** - MV table definition (analyzer, time-expression validator, ZK metadata) - Per-partition runtime metadata (PartitionInfo, PartitionState, PartitionFingerprint) and storage utilities - Controller-side consistency manager (subscribes to base-table segment events; debounces and CAS-marks affected MV partitions STALE) - Controller integration (PinotHelixResourceManager / SegmentDeletionManager notify methods; BaseControllerStarter wires the manager before listeners fire) - Minion task pipeline (generator + executor + segment lineage replace, saturation-LIMIT gate, partition-fingerprint CAS write to runtime znode) - pinot-materialized-view module: analysis/, consistency/, metadata/, scheduler/, executor/, context/ - Constants moved from MinionConstants.MaterializedViewTask to CommonConstants.MaterializedViewTask (pinot-spi) - Quickstart: airlineStatsMv example (TIMESTAMP MV column derived from base via DaysSinceEpoch * 86400000), with per-aggregation MV-vs-base result comparison - DESIGN.md documenting the time-windowed model and the planned fixed-partition extension **Out of scope (deferred to PR 2):** - Broker query rewrite engine + subsumption strategies (AggregationEquivalenceRegistry and the equivalence rule classes land with the engine that actually applies them) - Broker metadata cache + handler + split dispatcher - BrokerResponse.materializedViewQueried response annotation - BaseSingleStageBrokerRequestHandler MV integration Co-Authored-By: Claude Opus 4.7 (1M 
context) Co-Authored-By: Hongkun Xu --- .../common/metadata/ZKMetadataProvider.java | 20 + pinot-controller/pom.xml | 4 + .../controller/BaseControllerStarter.java | 40 +- .../helix/core/PinotHelixResourceManager.java | 225 +++- .../helix/core/SegmentDeletionManager.java | 49 + pinot-materialized-view/DESIGN.md | 233 ++++ pinot-materialized-view/pom.xml | 90 ++ .../analysis/MaterializedViewAnalyzer.java | 886 +++++++++++++ .../analysis/timeexpr/TimeExprValidator.java | 286 +++++ .../MaterializedViewConsistencyManager.java | 598 +++++++++ .../MaterializedViewTaskGeneratorContext.java | 85 ++ .../GrpcMaterializedViewQueryExecutor.java | 301 +++++ .../MaterializedViewQueryExecutor.java | 91 ++ .../MaterializedViewDefinitionMetadata.java | 234 ++++ ...terializedViewDefinitionMetadataUtils.java | 78 ++ .../MaterializedViewRuntimeMetadata.java | 136 ++ .../MaterializedViewRuntimeMetadataUtils.java | 121 ++ .../metadata/PartitionFingerprint.java | 155 +++ .../metadata/PartitionInfo.java | 131 ++ .../metadata/PartitionState.java | 58 + .../MaterializedViewTaskScheduler.java | 1009 +++++++++++++++ .../scheduler/MaterializedViewTaskUtils.java | 175 +++ .../MaterializedViewAnalyzerTest.java | 1108 +++++++++++++++++ .../timeexpr/TimeExprValidatorTest.java | 238 ++++ ...aterializedViewConsistencyManagerTest.java | 216 ++++ ...GrpcMaterializedViewQueryExecutorTest.java | 211 ++++ .../MaterializedViewMetadataTest.java | 155 +++ .../metadata/PartitionFingerprintTest.java | 132 ++ .../metadata/PartitionInfoTest.java | 138 ++ .../metadata/PartitionStateTest.java | 69 + .../MaterializedViewTaskSchedulerTest.java | 238 ++++ .../pinot-minion-builtin-tasks/pom.xml | 4 + .../MaterializedViewTaskExecutor.java | 827 ++++++++++++ .../MaterializedViewTaskExecutorFactory.java | 87 ++ .../MaterializedViewTaskGenerator.java | 120 ++ ...alizedViewTaskProgressObserverFactory.java | 34 + .../plugin/minion/tasks/TaskRegistryTest.java | 4 + .../MaterializedViewSegmentNameTest.java | 87 ++ 
.../MaterializedViewTaskExecutorTest.java | 283 +++++ .../pinot/spi/utils/CommonConstants.java | 135 ++ .../tools/MaterializedViewQuickStart.java | 389 ++++++ .../airlineStatsMv_offline_table_config.json | 26 + .../airlineStatsMv/airlineStatsMv_schema.json | 47 + pom.xml | 6 + 44 files changed, 9555 insertions(+), 4 deletions(-) create mode 100644 pinot-materialized-view/DESIGN.md create mode 100644 pinot-materialized-view/pom.xml create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzer.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidator.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManager.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/context/MaterializedViewTaskGeneratorContext.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutor.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/MaterializedViewQueryExecutor.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadata.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadataUtils.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadata.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadataUtils.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionFingerprint.java create mode 100644 
pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionInfo.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionState.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskScheduler.java create mode 100644 pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskUtils.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzerTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidatorTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManagerTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutorTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/MaterializedViewMetadataTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionFingerprintTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionInfoTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionStateTest.java create mode 100644 pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskSchedulerTest.java create mode 100644 pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutor.java create mode 100644 
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorFactory.java create mode 100644 pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskGenerator.java create mode 100644 pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskProgressObserverFactory.java create mode 100644 pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewSegmentNameTest.java create mode 100644 pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorTest.java create mode 100644 pinot-tools/src/main/java/org/apache/pinot/tools/MaterializedViewQuickStart.java create mode 100644 pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_offline_table_config.json create mode 100644 pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_schema.json diff --git a/pinot-common/src/main/java/org/apache/pinot/common/metadata/ZKMetadataProvider.java b/pinot-common/src/main/java/org/apache/pinot/common/metadata/ZKMetadataProvider.java index 00db262b8d4a..40c0bd98845c 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/metadata/ZKMetadataProvider.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/metadata/ZKMetadataProvider.java @@ -84,6 +84,9 @@ private ZKMetadataProvider() { private static final String PROPERTYSTORE_CLUSTER_CONFIGS_PREFIX = "/CONFIGS/CLUSTER"; private static final String PROPERTYSTORE_SEGMENT_LINEAGE = "/SEGMENT_LINEAGE"; private static final String PROPERTYSTORE_MINION_TASK_METADATA_PREFIX = "/MINION_TASK_METADATA"; + private static final String 
PROPERTYSTORE_MATERIALIZED_VIEW_DEFINITION_PREFIX = + "/CONFIGS/MATERIALIZED_VIEW/DEFINITION"; + private static final String PROPERTYSTORE_MATERIALIZED_VIEW_RUNTIME_PREFIX = "/CONFIGS/MATERIALIZED_VIEW/RUNTIME"; private static final String PROPERTYSTORE_QUERY_WORKLOAD_CONFIGS_PREFIX = "/CONFIGS/QUERYWORKLOAD"; private static final String PROPERTYSTORE_TASK_LOCK_SUFFIX = "-Lock"; @@ -328,6 +331,23 @@ public static String constructPropertyStorePathForMinionTaskMetadataDeprecated(S return StringUtil.join("/", PROPERTYSTORE_MINION_TASK_METADATA_PREFIX, taskType, tableNameWithType); } + public static String getPropertyStorePathForMaterializedViewDefinitionPrefix() { + return PROPERTYSTORE_MATERIALIZED_VIEW_DEFINITION_PREFIX; + } + + public static String constructPropertyStorePathForMaterializedViewDefinition( + String materializedViewTableNameWithType) { + return StringUtil.join("/", PROPERTYSTORE_MATERIALIZED_VIEW_DEFINITION_PREFIX, materializedViewTableNameWithType); + } + + public static String getPropertyStorePathForMaterializedViewRuntimePrefix() { + return PROPERTYSTORE_MATERIALIZED_VIEW_RUNTIME_PREFIX; + } + + public static String constructPropertyStorePathForMaterializedViewRuntime(String materializedViewTableNameWithType) { + return StringUtil.join("/", PROPERTYSTORE_MATERIALIZED_VIEW_RUNTIME_PREFIX, materializedViewTableNameWithType); + } + public static String constructPropertyStorePathForLogical(String tableName) { return StringUtil.join("/", ZkPaths.LOGICAL_TABLE_PARENT_PATH, tableName); } diff --git a/pinot-controller/pom.xml b/pinot-controller/pom.xml index 1ae7adf62238..3fa8183c99b2 100644 --- a/pinot-controller/pom.xml +++ b/pinot-controller/pom.xml @@ -36,6 +36,10 @@ https://registry.npmjs.org/npm/-/ + + org.apache.pinot + pinot-materialized-view + org.apache.pinot pinot-query-planner diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/BaseControllerStarter.java 
b/pinot-controller/src/main/java/org/apache/pinot/controller/BaseControllerStarter.java index 51807d8de878..29467121ed5f 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/BaseControllerStarter.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/BaseControllerStarter.java @@ -47,6 +47,7 @@ import org.apache.hc.client5.http.io.HttpClientConnectionManager; import org.apache.hc.core5.http.io.SocketConfig; import org.apache.hc.core5.util.Timeout; +import org.apache.helix.HelixAdmin; import org.apache.helix.HelixManager; import org.apache.helix.HelixManagerFactory; import org.apache.helix.InstanceType; @@ -55,9 +56,11 @@ import org.apache.helix.manager.zk.ZKHelixManager; import org.apache.helix.model.ClusterConstraints; import org.apache.helix.model.ConstraintItem; +import org.apache.helix.model.HelixConfigScope; import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.MasterSlaveSMD; import org.apache.helix.model.Message; +import org.apache.helix.model.builder.HelixConfigScopeBuilder; import org.apache.helix.store.zk.ZkHelixPropertyStore; import org.apache.helix.task.TaskDriver; import org.apache.helix.zookeeper.constant.ZkSystemPropertyKeys; @@ -144,6 +147,7 @@ import org.apache.pinot.core.transport.grpc.GrpcQueryServer; import org.apache.pinot.core.util.ListenerConfigUtil; import org.apache.pinot.core.util.trace.ContinuousJfrStarter; +import org.apache.pinot.materializedview.consistency.MaterializedViewConsistencyManager; import org.apache.pinot.segment.local.utils.TableConfigUtils; import org.apache.pinot.segment.spi.partition.PartitionFunctionFactory; import org.apache.pinot.spi.config.instance.InstanceConfigValidatorRegistry; @@ -239,6 +243,7 @@ public abstract class BaseControllerStarter implements ServiceStartable { protected RebalancePreChecker _rebalancePreChecker; protected TableRebalanceManager _tableRebalanceManager; protected DefaultClusterConfigChangeHandler _clusterConfigChangeHandler; + protected 
MaterializedViewConsistencyManager _materializedViewConsistencyManager; @Override public void init(PinotConfiguration pinotConfiguration) @@ -605,7 +610,35 @@ private void setUpPinotController() { LOGGER.info("Starting Pinot Helix resource manager and connecting to Zookeeper"); _helixResourceManager.start(_helixParticipantManager, _controllerMetrics); - // Initialize segment lifecycle event listeners + // Register MV consistency manager BEFORE any other lifecycle listener initialization. + // PinotHelixResourceManager.notifyMaterializedView* methods are entered as soon as + // _helixResourceManager.start() returns (segment add/delete/replace handlers no-op if + // the manager is null), so we want this to be the very first thing wired up so any + // segment events arriving immediately after Helix participant becomes online will + // correctly trigger STALE marking. + LOGGER.info("Initializing MaterializedView consistency manager"); + _materializedViewConsistencyManager = new MaterializedViewConsistencyManager(); + _materializedViewConsistencyManager.init(_helixResourceManager.getPropertyStore()); + // Wire a live cluster-config reader so caps like the consistency-manager debounce window + // can be overridden via `pinot-admin.sh ClusterConfig` without a controller restart. + final HelixAdmin helixAdminForMv = _helixResourceManager.getHelixAdmin(); + final String helixClusterName = _helixResourceManager.getHelixClusterName(); + _materializedViewConsistencyManager.setClusterConfigReader(configName -> { + try { + HelixConfigScope scope = new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.CLUSTER) + .forCluster(helixClusterName).build(); + Map values = helixAdminForMv.getConfig(scope, Collections.singletonList(configName)); + return values == null ? 
null : values.get(configName); + } catch (Exception e) { + return null; + } + }); + _helixResourceManager.registerMaterializedViewConsistencyManager(_materializedViewConsistencyManager); + _helixResourceManager.getSegmentDeletionManager() + .registerMaterializedViewConsistencyManager(_materializedViewConsistencyManager); + + // Initialize segment lifecycle event listeners (registered after MV manager so any + // listener that fires immediately on registration sees a fully-wired notify path). PinotSegmentLifecycleEventListenerManager.getInstance().init(_helixParticipantManager); LOGGER.info("Starting task resource manager"); @@ -1153,6 +1186,11 @@ private void stopPinotController() { LOGGER.info("Stopping Jersey admin API"); _adminApp.stop(); + if (_materializedViewConsistencyManager != null) { + LOGGER.info("Stopping MV consistency manager"); + _materializedViewConsistencyManager.stop(); + } + LOGGER.info("Stopping resource manager"); _helixResourceManager.stop(); diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java index a923ec3721a6..3aef26fe390c 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java @@ -161,6 +161,10 @@ import org.apache.pinot.controller.workload.QueryWorkloadManager; import org.apache.pinot.core.util.NumberUtils; import org.apache.pinot.core.util.NumericException; +import org.apache.pinot.materializedview.consistency.MaterializedViewConsistencyManager; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadata; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadataUtils; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadataUtils; 
import org.apache.pinot.segment.local.utils.TableConfigUtils; import org.apache.pinot.segment.spi.SegmentMetadata; import org.apache.pinot.spi.config.DatabaseConfig; @@ -168,6 +172,7 @@ import org.apache.pinot.spi.config.instance.InstanceConfigValidatorRegistry; import org.apache.pinot.spi.config.table.TableConfig; import org.apache.pinot.spi.config.table.TableStatsHumanReadable; +import org.apache.pinot.spi.config.table.TableTaskConfig; import org.apache.pinot.spi.config.table.TableType; import org.apache.pinot.spi.config.table.TagOverrideConfig; import org.apache.pinot.spi.config.table.TenantConfig; @@ -198,6 +203,7 @@ import org.apache.pinot.spi.utils.builder.TableNameBuilder; import org.apache.pinot.spi.utils.retry.RetryPolicies; import org.apache.pinot.spi.utils.retry.RetryPolicy; +import org.apache.pinot.sql.parsers.CalciteSqlParser; import org.apache.zookeeper.data.Stat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -256,6 +262,7 @@ private enum LineageUpdateType { // renames. The resulting extra session is consistent with the controller's existing footprint // (_propertyStore cache client, _leadControllerManager manager client are each distinct). 
private volatile ZkClient _zkClient; + private volatile MaterializedViewConsistencyManager _materializedViewConsistencyManager; public PinotHelixResourceManager(String helixClusterName, @Nullable String dataDir, boolean isSingleTenantCluster, boolean enableBatchMessageMode, int deletedSegmentsRetentionInDays, @@ -1967,6 +1974,7 @@ public void addTable(TableConfig tableConfig, List streamMetadat return is; }); _queryWorkloadManager.propagateWorkloadFor(tableNameWithType); + notifyMaterializedViewConsistencyManagerForTableCreate(tableConfig); LOGGER.info("Adding table {}: Successfully added table", tableNameWithType); } @@ -2206,6 +2214,11 @@ public void registerPinotLLCRealtimeSegmentManager(PinotLLCRealtimeSegmentManage _pinotLLCRealtimeSegmentManager = pinotLLCRealtimeSegmentManager; } + public void registerMaterializedViewConsistencyManager( + MaterializedViewConsistencyManager materializedViewConsistencyManager) { + _materializedViewConsistencyManager = materializedViewConsistencyManager; + } + private void assignInstances(TableConfig tableConfig, boolean override) { String tableNameWithType = tableConfig.getTableName(); String rawTableName = TableNameBuilder.extractRawTableName(tableNameWithType); @@ -2505,6 +2518,26 @@ public void deleteTable(String tableName, TableType tableType, @Nullable String String tableNameWithType = TableNameBuilder.forType(tableType).tableNameWithType(tableName); LOGGER.info("Deleting table {}: Start", tableNameWithType); + // Block the delete when materialized views depend on this base table. Orphaning an MV + // leaves its runtime znode pointing at a base that no longer exists; the MV's task + // generator would then fail forever on every cycle, and the broker rewrite engine (PR 2) + // would keep serving the now-orphaned MV partitions as if the base were intact — silent + // stale data. Force the operator to drop dependent MVs first. 
+ MaterializedViewConsistencyManager mvMgr = _materializedViewConsistencyManager; + if (mvMgr != null) { + String rawTableName = TableNameBuilder.extractRawTableName(tableNameWithType); + List dependentMVs = mvMgr.getDependentMaterializedViews(rawTableName); + // Don't block when the table being dropped IS an MV (self-reference impossible, but the + // index entry exists for MVs over MVs — currently unsupported, but the guard is cheap). + dependentMVs.remove(tableNameWithType); + if (!dependentMVs.isEmpty()) { + throw new IllegalStateException(String.format( + "Cannot delete table '%s': %d materialized view(s) depend on it: %s. " + + "Drop the dependent materialized views first.", + tableNameWithType, dependentMVs.size(), dependentMVs)); + } + } + // Remove the table from brokerResource HelixHelper.removeResourceFromBrokerIdealState(_helixZkManager, tableNameWithType); LOGGER.info("Deleting table {}: Removed from broker resource", tableNameWithType); @@ -2560,6 +2593,23 @@ public void deleteTable(String tableName, TableType tableType, @Nullable String MinionTaskMetadataUtils.deleteTaskMetadata(_propertyStore, tableNameWithType); LOGGER.info("Deleting table {}: Removed all minion task metadata", tableNameWithType); + // Remove materialized view metadata (if any) and unregister from consistency manager + notifyMaterializedViewConsistencyManagerForTableDrop(tableNameWithType); + try { + MaterializedViewDefinitionMetadataUtils.delete(_propertyStore, tableNameWithType); + LOGGER.info("Deleting table {}: Removed MV definition metadata", tableNameWithType); + } catch (Exception e) { + LOGGER.debug("Deleting table {}: No MV definition metadata to remove or removal failed", + tableNameWithType, e); + } + try { + MaterializedViewRuntimeMetadataUtils.delete(_propertyStore, tableNameWithType); + LOGGER.info("Deleting table {}: Removed MV runtime metadata", tableNameWithType); + } catch (Exception e) { + LOGGER.debug("Deleting table {}: No MV runtime metadata to remove or 
removal failed", + tableNameWithType, e); + } + // Remove table config // NOTE: This should always be the last step for deletion to avoid race condition in table re-create ZKMetadataProvider.removeResourceConfigFromPropertyStore(_propertyStore, tableNameWithType); @@ -2832,6 +2882,7 @@ public void addNewSegment(String tableNameWithType, SegmentMetadata segmentMetad LOGGER.info("Added segment: {} of table: {} to property store", segmentName, tableNameWithType); assignSegment(tableConfig, segmentZKMetadata); + // MV notify fires inside createSegmentZkMetadata above; no separate call needed here. } public boolean needTieredSegmentAssignment(TableConfig tableConfig) { @@ -3023,16 +3074,44 @@ public ZNRecord getSegmentMetadataZnRecord(String tableNameWithType, String segm * @return */ public boolean createSegmentZkMetadata(String tableNameWithType, SegmentZKMetadata segmentZKMetadata) { - return ZKMetadataProvider.createSegmentZkMetadata(_propertyStore, tableNameWithType, segmentZKMetadata); + boolean created = + ZKMetadataProvider.createSegmentZkMetadata(_propertyStore, tableNameWithType, segmentZKMetadata); + if (created) { + // Notify the MV consistency manager so any partition of a dependent MV that overlaps the + // new segment's time range is marked STALE. This hook covers the OFFLINE segment-upload + // paths that route through `ZKOperator.processNewSegment` (REST `/segments` upload) and + // the legacy `addNewSegment` helper. + // + // NOTE: the LLC realtime-segment-commit path (`PinotLLCRealtimeSegmentManager + // .persistSegmentZKMetadata`) writes directly via `_propertyStore.set` and does NOT + // route through this method. `MaterializedViewAnalyzer.validateSourceTable` rejects + // REALTIME source tables at create time precisely because of this gap; if/when realtime + // sources are supported in a later PR the LLC commit path must also gain a notify hook. 
+ notifyMaterializedViewConsistencyManager(tableNameWithType, segmentZKMetadata.getStartTimeMs(), + segmentZKMetadata.getEndTimeMs()); + } + return created; } public boolean updateZkMetadata(String tableNameWithType, SegmentZKMetadata segmentZKMetadata, int expectedVersion) { - return ZKMetadataProvider.setSegmentZKMetadata(_propertyStore, tableNameWithType, segmentZKMetadata, + boolean updated = ZKMetadataProvider.setSegmentZKMetadata(_propertyStore, tableNameWithType, segmentZKMetadata, expectedVersion); + if (updated) { + // Segment metadata updates (e.g. refresh, CRC change) may shift the segment's time range + // or replace its content. Notify so MV partitions that overlap pick up STALE. + notifyMaterializedViewConsistencyManager(tableNameWithType, segmentZKMetadata.getStartTimeMs(), + segmentZKMetadata.getEndTimeMs()); + } + return updated; } public boolean updateZkMetadata(String tableNameWithType, SegmentZKMetadata segmentZKMetadata) { - return ZKMetadataProvider.setSegmentZKMetadata(_propertyStore, tableNameWithType, segmentZKMetadata); + boolean updated = ZKMetadataProvider.setSegmentZKMetadata(_propertyStore, tableNameWithType, segmentZKMetadata); + if (updated) { + notifyMaterializedViewConsistencyManager(tableNameWithType, segmentZKMetadata.getStartTimeMs(), + segmentZKMetadata.getEndTimeMs()); + } + return updated; } public boolean removeSegmentZKMetadata(String tableNameWithType, String segmentName) { @@ -4618,6 +4697,8 @@ public void endReplaceSegments(String tableNameWithType, String segmentLineageEn LOGGER.info("endReplaceSegments is successfully processed in {} ms on attempt: {}. 
(tableNameWithType = {}, " + "segmentLineageEntryId = {})", System.currentTimeMillis() - endReplaceSegmentsTs, attemptCount + 1, tableNameWithType, segmentLineageEntryId); + + notifyMaterializedViewConsistencyManagerForReplace(tableNameWithType, segmentLineageEntryId); } /** @@ -5206,4 +5287,142 @@ public static void main(String[] args) throws Exception { System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(record)); } */ + + // ── MV Consistency Manager helpers ── + + private void notifyMaterializedViewConsistencyManagerForTableCreate(TableConfig tableConfig) { + MaterializedViewConsistencyManager mgr = _materializedViewConsistencyManager; + if (mgr == null) { + return; + } + try { + TableTaskConfig taskConfig = tableConfig.getTaskConfig(); + if (taskConfig == null) { + return; + } + Map materializedViewTaskConfigs = + taskConfig.getConfigsForTaskType(CommonConstants.MaterializedViewTask.TASK_TYPE); + if (materializedViewTaskConfigs == null) { + return; + } + String definedSQL = materializedViewTaskConfigs.get(CommonConstants.MaterializedViewTask.DEFINED_SQL_KEY); + if (definedSQL == null || definedSQL.isEmpty()) { + return; + } + String sourceTable = CalciteSqlParser.compileToPinotQuery(definedSQL).getDataSource().getTableName(); + mgr.onMaterializedViewTableCreated(tableConfig.getTableName(), Collections.singletonList(sourceTable)); + } catch (Exception e) { + LOGGER.warn("Failed to register MV table {} with consistency manager", tableConfig.getTableName(), e); + } + } + + private void notifyMaterializedViewConsistencyManagerForTableDrop(String tableNameWithType) { + MaterializedViewConsistencyManager mgr = _materializedViewConsistencyManager; + if (mgr == null) { + return; + } + try { + MaterializedViewDefinitionMetadata materializedViewDefinition = + MaterializedViewDefinitionMetadataUtils.fetch(_propertyStore, tableNameWithType); + if (materializedViewDefinition != null && materializedViewDefinition.getBaseTables() != null + && 
!materializedViewDefinition.getBaseTables().isEmpty()) { + mgr.onMaterializedViewTableDropped(tableNameWithType, materializedViewDefinition.getBaseTables()); + return; + } + // Fall back to reading source table from table task config + TableConfig tableConfig = ZKMetadataProvider.getTableConfig(_propertyStore, tableNameWithType); + if (tableConfig == null) { + return; + } + TableTaskConfig taskConfig = tableConfig.getTaskConfig(); + if (taskConfig == null) { + return; + } + Map materializedViewTaskConfigs = + taskConfig.getConfigsForTaskType(CommonConstants.MaterializedViewTask.TASK_TYPE); + if (materializedViewTaskConfigs == null) { + return; + } + String definedSQL = materializedViewTaskConfigs.get(CommonConstants.MaterializedViewTask.DEFINED_SQL_KEY); + if (definedSQL == null || definedSQL.isEmpty()) { + return; + } + String sourceTable = CalciteSqlParser.compileToPinotQuery(definedSQL).getDataSource().getTableName(); + mgr.onMaterializedViewTableDropped(tableNameWithType, Collections.singletonList(sourceTable)); + } catch (Exception e) { + LOGGER.warn("Failed to unregister MV table {} from consistency manager", tableNameWithType, e); + } + } + + private void notifyMaterializedViewConsistencyManager(String tableNameWithType, long startTimeMs, long endTimeMs) { + MaterializedViewConsistencyManager mgr = _materializedViewConsistencyManager; + if (mgr == null) { + return; + } + String rawTableName = TableNameBuilder.extractRawTableName(tableNameWithType); + // Fast-path: skip ALL further work — including any log emission — when no MV depends on + // this base table. This method is called on every controller-side ZK segment-metadata + // write (createSegmentZkMetadata + updateZkMetadata), which on a realtime table fires at + // the per-consuming-segment commit rate. 
Without this early bail, even a no-op notify + // chain (extract name + WARN log + onBaseTableFullInvalidation → fast-path inside) would + // multiply controller CPU + log volume on every realtime commit. Verified by + // PauselessRealtimeIngestionNewSegmentMetadataCreationFailureTest, which timed out at 100s + // when the per-commit WARN log was unconditionally emitted. + if (mgr.getDependentMaterializedViews(rawTableName).isEmpty()) { + return; + } + if (startTimeMs < 0 || endTimeMs < 0) { + // Consuming/realtime segments and segments built from records with null timestamps + // can carry -1 startTime/endTime in their ZK metadata. Skipping the notification would + // leave any MV defined on top of this base table with stale VALID partitions; we instead + // signal a full-range invalidation so the consistency manager marks every affected + // partition STALE. + LOGGER.warn("Base table {} segment update reports startTimeMs={}, endTimeMs={}; treating " + + "as full-range MV invalidation.", tableNameWithType, startTimeMs, endTimeMs); + mgr.onBaseTableFullInvalidation(rawTableName); + return; + } + mgr.onBaseTableDataChange(rawTableName, startTimeMs, endTimeMs); + } + + private void notifyMaterializedViewConsistencyManagerForReplace(String tableNameWithType, + String segmentLineageEntryId) { + MaterializedViewConsistencyManager mgr = _materializedViewConsistencyManager; + if (mgr == null) { + return; + } + try { + SegmentLineage lineage = SegmentLineageAccessHelper.getSegmentLineage(_propertyStore, tableNameWithType); + if (lineage == null) { + return; + } + LineageEntry entry = lineage.getLineageEntry(segmentLineageEntryId); + if (entry == null) { + return; + } + long minStart = Long.MAX_VALUE; + long maxEnd = Long.MIN_VALUE; + for (String segName : entry.getSegmentsFrom()) { + SegmentZKMetadata meta = getSegmentZKMetadata(tableNameWithType, segName); + if (meta != null) { + minStart = Math.min(minStart, meta.getStartTimeMs()); + maxEnd = Math.max(maxEnd, 
meta.getEndTimeMs()); + } + } + for (String segName : entry.getSegmentsTo()) { + SegmentZKMetadata meta = getSegmentZKMetadata(tableNameWithType, segName); + if (meta != null) { + minStart = Math.min(minStart, meta.getStartTimeMs()); + maxEnd = Math.max(maxEnd, meta.getEndTimeMs()); + } + } + if (minStart != Long.MAX_VALUE && maxEnd != Long.MIN_VALUE) { + String rawTableName = TableNameBuilder.extractRawTableName(tableNameWithType); + mgr.onBaseTableDataChange(rawTableName, minStart, maxEnd); + } + } catch (Exception e) { + LOGGER.warn("Failed to notify MV consistency manager for segment replace on table: {}", + tableNameWithType, e); + } + } } diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/SegmentDeletionManager.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/SegmentDeletionManager.java index e72ac521cec3..8c4c7c4b484e 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/SegmentDeletionManager.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/SegmentDeletionManager.java @@ -46,11 +46,13 @@ import org.apache.helix.store.zk.ZkHelixPropertyStore; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.apache.pinot.common.metadata.ZKMetadataProvider; +import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; import org.apache.pinot.common.utils.TarCompressionUtils; import org.apache.pinot.common.utils.URIUtils; import org.apache.pinot.controller.LeadControllerManager; import org.apache.pinot.core.segment.processing.lifecycle.PinotSegmentLifecycleEventListenerManager; import org.apache.pinot.core.segment.processing.lifecycle.impl.SegmentDeletionEventDetails; +import org.apache.pinot.materializedview.consistency.MaterializedViewConsistencyManager; import org.apache.pinot.segment.local.utils.SegmentPushUtils; import org.apache.pinot.spi.config.table.SegmentsValidationAndRetentionConfig; import 
org.apache.pinot.spi.config.table.TableConfig; @@ -93,6 +95,7 @@ public class SegmentDeletionManager { private final HelixAdmin _helixAdmin; private final ZkHelixPropertyStore _propertyStore; private final long _defaultDeletedSegmentsRetentionMs; + private volatile MaterializedViewConsistencyManager _materializedViewConsistencyManager; public SegmentDeletionManager(String dataDir, HelixAdmin helixAdmin, String helixClusterName, ZkHelixPropertyStore propertyStore, int deletedSegmentsRetentionInDays) { @@ -116,6 +119,49 @@ public void stop() { _executorService.shutdownNow(); } + public void registerMaterializedViewConsistencyManager( + MaterializedViewConsistencyManager materializedViewConsistencyManager) { + _materializedViewConsistencyManager = materializedViewConsistencyManager; + } + + private void notifyMaterializedViewConsistencyManager(String tableName, List segmentsToDelete) { + MaterializedViewConsistencyManager mgr = _materializedViewConsistencyManager; + if (mgr == null || segmentsToDelete.isEmpty()) { + return; + } + try { + long minStart = Long.MAX_VALUE; + long maxEnd = Long.MIN_VALUE; + boolean sawSegmentWithoutTime = false; + for (String segmentId : segmentsToDelete) { + SegmentZKMetadata meta = ZKMetadataProvider.getSegmentZKMetadata(_propertyStore, tableName, segmentId); + if (meta != null) { + long startMs = meta.getStartTimeMs(); + long endMs = meta.getEndTimeMs(); + if (startMs >= 0 && endMs >= 0) { + minStart = Math.min(minStart, startMs); + maxEnd = Math.max(maxEnd, endMs); + } else { + sawSegmentWithoutTime = true; + } + } + } + String rawTableName = TableNameBuilder.extractRawTableName(tableName); + if (sawSegmentWithoutTime) { + // At least one deleted segment lacked time-range metadata. To avoid leaking stale + // VALID partitions in any dependent MV, signal a full-range invalidation. The + // consistency manager's BUCKET_MISSING_MARK_CAP bounds blast radius for long-history MVs. 
+ LOGGER.warn("Base table {} segment deletion includes segments without startTime/endTime; " + + "treating as full-range MV invalidation.", tableName); + mgr.onBaseTableFullInvalidation(rawTableName); + } else if (minStart != Long.MAX_VALUE && maxEnd != Long.MIN_VALUE) { + mgr.onBaseTableDataChange(rawTableName, minStart, maxEnd); + } + } catch (Exception e) { + LOGGER.warn("Failed to notify MV consistency manager for segment deletion on table: {}", tableName, e); + } + } + public void deleteSegments(String tableName, Collection segmentIds) { deleteSegments(tableName, segmentIds, (Long) null); } @@ -177,6 +223,9 @@ protected synchronized void deleteSegmentFromPropertyStoreAndLocal(String tableN propStorePathList.add(segmentPropertyStorePath); } + // Capture segment time ranges before ZK metadata is removed (for MV dirty marking) + notifyMaterializedViewConsistencyManager(tableName, segmentsToDelete); + // Notify all active listeners here PinotSegmentLifecycleEventListenerManager.getInstance() .notifyListeners(new SegmentDeletionEventDetails(tableName, segmentsToDelete)); diff --git a/pinot-materialized-view/DESIGN.md b/pinot-materialized-view/DESIGN.md new file mode 100644 index 000000000000..9dcf8c92a580 --- /dev/null +++ b/pinot-materialized-view/DESIGN.md @@ -0,0 +1,233 @@ + + +# Pinot Materialized Views — Design + +This document covers the design of the materialized-view (MV) subsystem and its +intended extension points. It is meant for contributors who are about to extend +this module — particularly for the **fixed-partition MV** work that follows the +initial time-windowed MV implementation. + +## 1. Today's design (PR 1 + PR 2) + +PR 1 ships **time-windowed MVs only**. PR 2 adds broker query rewrite on top of +the same partition model. 
The runtime contract is: + +- **Definition** (`MaterializedViewDefinitionMetadata`, persisted under + `/CONFIGS/MATERIALIZED_VIEW/DEFINITION/<mvTableName>`): + user-supplied SQL, source-table reference, source partition expressions, + `rewriteEnabled`, `stalenessThresholdMs`. Immutable post-create except for + the rewrite flag and SLO knobs. +- **Runtime** (`MaterializedViewRuntimeMetadata`, persisted under + `/CONFIGS/MATERIALIZED_VIEW/RUNTIME/<mvTableName>`): + `watermarkMs` plus a `Map<Long, PartitionInfo>` keyed by `bucketStartMs`. + Updated by the minion executor on task completion and by the controller-side + `MaterializedViewConsistencyManager` on base-table segment changes. +- **Partitioning**: uniform-width time buckets defined by `bucketTimePeriod` on + the MV's task config. The MV's designated time column must be either an + identity passthrough of the base time column or a `DATETRUNC` whose unit + matches the bucket width — `TimeExprValidator` enforces both at create time. +- **STALE marking**: range-based. Controller calls + `MaterializedViewConsistencyManager.onBaseTableDataChange(table, startMs, endMs)`; + the manager debounces (5 s) and then maps the range to overlapping buckets + via floor-div arithmetic. +- **Task selection**: priority-ordered. STALE partitions are OVERWRITE'd first + (after fingerprint reconfirmation), then APPEND advances the watermark one + bucket at a time, batched up to `maxTasksPerBatch`. + +Strengths of this model: small constant memory per MV, deterministic bucket +arithmetic, naturally compatible with Pinot's existing time-partitioned segment +layout, and a single immutable-coverage assumption that closes the silent-stale +correctness hole at create time (upsert / dedup / dim / REFRESH base tables are +rejected by the analyzer). + +Limitations of this model — all of which the **fixed-partition** extension +addresses: + +- Real workloads sometimes want a categorical partition key (per-tenant + rollups, per-country aggregates) rather than a time window.
+- Some MVs want a single-row full-table aggregate — no partition key at all, + just "refresh every N minutes." +- Range-based STALE marking is useless when the partition key is not derived + from a base column the controller can read from segment metadata. + +## 2. The fixed-partition extension + +### 2.1 Two new shapes to support (alongside today's `TIME`) + +| Shape | Description | Example | +| ------------- | ------------------------------------------------------------ | ------------------------------------------------------------------ | +| `TIME` | (today) Time-windowed buckets keyed by `bucketStartMs`. | Per-day-per-carrier flight rollups. | +| `CATEGORICAL` | N fixed buckets keyed by a declared partition column value. | Per-tenant rollups; one bucket per `tenant_id`. | +| `SINGLETON` | Exactly one bucket; the MV is a full-table aggregate. | Top-K customers across the entire warehouse, refreshed every 10 m. | + +`SINGLETON` is a degenerate case of `CATEGORICAL` with one fixed key. Treat +them as one extension point.
PR 1's analyzer + holds a small `SUPPORTED_MATERIALIZED_VIEW_AGGREGATIONS` constant set with + the same function names so create-time validation matches what PR 2's + rewrite will support; the two must stay in sync. + +Subsystems that **change**, ordered from least to most invasive: + +1. **Definition schema**. Add a `partitionKind: TIME | CATEGORICAL | SINGLETON` + discriminator on `MaterializedViewDefinitionMetadata`. `TIME` is the implicit + default for back-compat on existing definitions. For `CATEGORICAL`, also + persist the partition column name and (optionally) a frozen list of expected + partition keys — the freeze is required so the analyzer can validate the + set at create time and the scheduler can enumerate refresh candidates. +2. **Analyzer**. `MaterializedViewAnalyzer` dispatches on `partitionKind`: + - `TIME` (today): require time column + `bucketTimePeriod`; reject + mutable source types. + - `CATEGORICAL`: require a partition column that exists in both the + source schema and the MV schema, with the same type. A source that is + not partitioned on the same column is still allowed, but is recorded as + a "scatter-gather refresh" mode that requires the executor to + issue a `WHERE partitionColumn = ?` query per partition. + - `SINGLETON`: no partition column; one task per refresh interval. +3. **Runtime metadata**. `MaterializedViewRuntimeMetadata._partitions` changes + from `Map<Long, PartitionInfo>` to `Map<PartitionKey, PartitionInfo>` (the on-disk + wire format is **already** string-keyed; this is an in-memory refactor only, + no on-disk schema change). Introduce a `PartitionKey` value type that wraps + either a `Long bucketStartMs` (TIME) or a `String categoricalKey` + (CATEGORICAL / SINGLETON) — keeps call sites typed. `watermarkMs` becomes + optional, populated only for `TIME` MVs. `MaterializedViewRuntimeMetadata` + gets a `PartitionKind` field so readers know how to interpret the keys. +4. **Consistency manager**. Dispatch on `partitionKind`: + - `TIME`: today's range-to-bucket sweep.
+ - `CATEGORICAL` / `SINGLETON`: the controller does not know which + categorical partition a base segment touched, so any base change + routes to `onBaseTableFullInvalidation`. Implementation: mark every + partition in the runtime znode STALE. The debounce + retry logic + already in place applies unchanged. + + The public entry point `onBaseTableFullInvalidation(rawTableName)` exists + today (PR 1 shipped it for the time-MV "unknown range" case); fixed-MV + callers route through the same method, so the controller-side notification + code does not need to grow new branches when fixed-MV lands. +5. **Scheduler**. Two selection strategies behind a partition-kind dispatch: + - `TIME` (today): watermark-based APPEND + STALE OVERWRITE prioritization. + - `CATEGORICAL` / `SINGLETON`: FIFO over STALE partitions by + `lastRefreshTime`, plus a "force refresh every N minutes regardless of + STALE" knob driven by `stalenessThresholdMs`. No watermark, no time + arithmetic, no buffer period. APPEND is meaningless for these shapes. + + The SQL the scheduler emits for a CATEGORICAL refresh is: + `<definedSQL> WHERE <partitionColumn> = <partitionKey>` + — the same text-splice pattern used today for time-range filters, but on + a categorical predicate. +6. **Executor**. The minion executor receives a SQL string and a partition + key; it builds segments, calls `validateSourceFingerprintAtCommit`, and + writes back to runtime metadata. None of that depends on partition shape; + the only change is that the partition is identified by a new, separate + `partitionKey` task config key instead of `partStartMs` / `partEndMs`, + which stay untouched to preserve the time-MV path's existing semantics. + +### 2.3 Wire-format compatibility + +Two-step roll-out, no breaking changes: + +1. Land definition `partitionKind` (default `TIME`) and a `PartitionKey` value + type internally. Existing TIME-MVs round-trip identically. +2. Land the CATEGORICAL/SINGLETON code paths behind the new discriminator.
+ The runtime znode's map-field key remains a string in both shapes; the + reader path branches on `partitionKind` to interpret the string as either a + millisecond timestamp or a categorical key. + +The mixed-version concern (older controllers reading a fixed-partition runtime +znode written by a newer controller) is closed by the `partitionKind` field +defaulting to `TIME` on absence — older controllers that don't know about fixed +partitions will refuse to parse the categorical keys (`NumberFormatException` +on `Long.parseLong(key)`) and the existing `fromZNRecord` forward-compat +catch-and-ignore preserves the rest of the metadata. + +### 2.4 Interface cleanliness — what's already done and what isn't + +| Interface | State today | Action needed when fixed-MV lands | +| --------------------------------------------------- | ------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | +| `MaterializedViewQueryExecutor` | Shape-neutral. | None. | +| `MaterializedViewTaskGeneratorContext` | Shape-neutral (metadata + ZK lookup only). | None. | +| `MaterializedViewConsistencyManager.on…Change` | Range-based; full-invalidation entry point added. | Dispatch on `partitionKind` internally; callers unchanged because they already use `onBaseTableFullInvalidation` | +| | | when they cannot supply a tight range. | +| `MaterializedViewRuntimeMetadata` | `_partitions: Map<Long, PartitionInfo>`. | Generalize to `Map<PartitionKey, PartitionInfo>` (or `Map<String, PartitionInfo>` with a separate `PartitionKind`). | +| | `_watermarkMs: long`. | Make optional; populated only for `TIME`. | +| `PartitionFingerprint` / `PartitionState` | Shape-neutral. | None. | +| `AggregationEquivalenceRegistry` (lands in PR 2) | Shape-neutral (re-agg algebra). | None. PR 1 keeps just a string set in the analyzer for create-time validation. | +| `MaterializedViewAnalyzer.validateSourceTable` | Time-MV only.
| Sibling validator for CATEGORICAL: require partition column existence and (optional) frozen key list. | +| `MaterializedViewAnalyzer.validateTaskConfigs` | Hard-requires `bucketTimePeriod`. | Make required-on-`TIME`; optional / absent on CATEGORICAL. | +| `MaterializedViewTaskScheduler.generateTasks` | Watermark + APPEND. | Branch on `partitionKind`; CATEGORICAL goes through FIFO-STALE + refresh-interval logic. | +| `MaterializedViewTaskExecutor.executeTask` | Window-based. | Branch on task-config `partitionKey` shape; segment building is unchanged. | +| `MaterializedViewDefinitionMetadata` ZK wire format | Untyped; can carry extra fields. | Add `partitionKind` + `partitionColumn` keys; defaulted by older readers. | + +### 2.5 Minimum viable fixed-partition PR + +If the next contributor wants to land CATEGORICAL support in one focused PR, +the smallest defensible scope is: + +- `PartitionKind` enum + `MaterializedViewDefinitionMetadata.partitionKind` + field with `TIME` default. +- New analyzer entry point `validateCategoricalSourceTable` that takes a + partition-column name; reuse the existing `validateSourceTable` for the + mutable-source-type guards. +- `MaterializedViewRuntimeMetadata`: switch `_partitions` key from `Long` to + `PartitionKey` (sum type). `_watermarkMs` stays for back-compat; readers + treat it as advisory on non-TIME MVs. +- `MaterializedViewConsistencyManager`: branch on `partitionKind` in + `markPartitionsDirty`; CATEGORICAL marks all partitions STALE in one CAS. + No changes to public API. +- `MaterializedViewTaskScheduler`: new `generateCategoricalTasks` method that + enumerates STALE partitions, builds `WHERE <partitionColumn> = <partitionKey>` SQL + text, and emits one task per partition (bounded by `maxTasksPerBatch`). +- `MaterializedViewTaskExecutor`: read `partitionKey` from task config and + treat it as opaque; pass-through to runtime metadata update.
+ +`SINGLETON` falls out of `CATEGORICAL` by treating it as one fixed key — no +extra code path needed once the above lands. + +## 3. Things explicitly out of scope (for this document) + +- Streaming row-by-row consumption of the gRPC response in + `MaterializedViewTaskExecutor`. Tracked separately; orthogonal to partition + shape. +- Broker-side rewrite of the user query against a fixed-partition MV. The + re-aggregation rules in `AggregationEquivalenceRegistry` are + partition-shape-agnostic; what the broker side needs to add is a + partition-key predicate on the rewrite output (so a user query that filters + by `tenant_id` gets routed to the matching MV partition's segments). That + belongs in the broker rewrite PR series, not in the MV ingestion module. +- Cross-shape MVs (e.g. time × tenant): doable by composing + `Map<PartitionKey, Map<PartitionKey, PartitionInfo>>`, but no concrete user + ask today. Defer until there is. diff --git a/pinot-materialized-view/pom.xml b/pinot-materialized-view/pom.xml new file mode 100644 index 000000000000..404bd85f4ff4 --- /dev/null +++ b/pinot-materialized-view/pom.xml @@ -0,0 +1,90 @@ + + + + 4.0.0 + + pinot + org.apache.pinot + 1.6.0-SNAPSHOT + + pinot-materialized-view + Pinot Materialized View + https://pinot.apache.org/ + + ${basedir}/..
+ + + + + org.apache.pinot + pinot-core + + + org.apache.pinot + pinot-common + + + org.apache.pinot + pinot-spi + + + org.apache.pinot + pinot-segment-spi + + + org.apache.helix + helix-core + + + com.google.guava + guava + + + org.apache.commons + commons-collections4 + + + org.apache.commons + commons-lang3 + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-api + + + + + org.testng + testng + test + + + org.mockito + mockito-core + test + + + diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzer.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzer.java new file mode 100644 index 000000000000..9bae3d479ff7 --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzer.java @@ -0,0 +1,886 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.materializedview.analysis; + +import com.google.common.base.Preconditions; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import org.apache.calcite.sql.SqlCall; +import org.apache.calcite.sql.SqlNode; +import org.apache.calcite.sql.SqlSelect; +import org.apache.calcite.sql.util.SqlBasicVisitor; +import org.apache.pinot.common.request.DataSource; +import org.apache.pinot.common.request.Expression; +import org.apache.pinot.common.request.ExpressionType; +import org.apache.pinot.common.request.Function; +import org.apache.pinot.common.request.PinotQuery; +import org.apache.pinot.common.utils.request.RequestUtils; +import org.apache.pinot.materializedview.analysis.timeexpr.TimeExprValidator; +import org.apache.pinot.materializedview.context.MaterializedViewTaskGeneratorContext; +import org.apache.pinot.materializedview.scheduler.MaterializedViewTaskUtils; +import org.apache.pinot.segment.spi.AggregationFunctionType; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.DateTimeFieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; +import org.apache.pinot.spi.utils.TimeUtils; +import org.apache.pinot.spi.utils.builder.TableNameBuilder; +import org.apache.pinot.sql.parsers.CalciteSqlParser; +import org.apache.pinot.sql.parsers.SqlCompilationException; + + +/// Validates a materialized-view (MV) definition end-to-end using Calcite AST parsing. +/// +/// Fail-fast: throws [IllegalStateException] on the first validation error. 
+/// Validations performed (in order): +/// +/// - SQL syntax and semantic analysis via [CalciteSqlParser] +/// - Source (base) table existence, source-type eligibility (rejects upsert / dedup / +/// dimension / REFRESH-push tables whose mutability breaks the MV's immutable-coverage +/// assumption), and time-column configuration +/// - Source column existence for all identifiers referenced in the query +/// - MV schema column completeness against the SELECT output fields +/// - Aggregation function recognition +/// - Task config parameter validity (bucket period, buffer period, etc.) +/// - MV time column alignment: `segmentsConfig.timeColumnName` exists in the MV +/// schema as a [DateTimeFieldSpec] and is produced by a SELECT expression +/// - MV time column type / SELECT shape: the SELECT expression producing the MV time column +/// must be either an identity passthrough of the base time column or a `DATETRUNC` call +/// whose unit matches `bucketTimePeriod`. Both base and MV time columns must use +/// [DataType#TIMESTAMP] — validated by [TimeExprValidator]. This is intentionally +/// opinionated; format-string inference (`1:DAYS:EPOCH` vs `1:MILLISECONDS:EPOCH`, +/// `SIMPLE_DATE_FORMAT`, etc.) is unsupported. +/// +/// +/// Thread-safety: all methods are stateless and static. +/// +///

Partition model (TIME-WINDOWED ONLY in PR 1)

/// Validates the MV definition and returns extracted metadata on success.
///
/// The `Step N` labels in the body identify the individual checks; they are not the execution
/// order. The checks actually run in this sequence, and the first failure aborts the rest:
///
/// 1. task configs (labelled Step 4; runs first because it needs no SQL parsing) — its return
///    value is kept as `bucketMs` and handed to the final time-column check
/// 2. SQL syntax (Step 1), no nested SELECT (Step 1a), LIMIT / OFFSET rules (Step 1b)
/// 3. source table (Step 2) and source columns
/// 4. MV schema columns against the SELECT list (Step 3)
/// 5. partition-expression extraction (Step 5)
/// 6. MV time column alignment (Step 6) and MV time column shape / type (Step 7)
///
/// @param definedSql the user-defined SQL query for the MV
/// @param viewTableConfig the MV table's [TableConfig]
/// @param viewSchema the MV table's [Schema]
/// @param taskConfigs task-type-specific configuration map
/// @param context accessor for looking up source table config/schema
/// @return [AnalysisResult] with extracted source table name and select field names
/// @throws IllegalStateException on the first validation error encountered
public static AnalysisResult analyze(String definedSql, TableConfig viewTableConfig, Schema viewSchema,
    Map taskConfigs, MaterializedViewTaskGeneratorContext context) {

  // Step 4 first: cheap config checks that don't require parsing
  long bucketMs = validateTaskConfigs(viewTableConfig, taskConfigs, context);

  // Step 1: SQL syntax and Pinot semantic validation
  PinotQuery pinotQuery = validateSqlSyntax(definedSql);

  // Step 1a: reject nested SELECT / subqueries. The scheduler's text-based time-range
  // splicing (MaterializedViewTaskScheduler#appendTimeRange) attaches the time predicate
  // after the FIRST `WHERE`; a subquery would receive the predicate at the wrong query
  // level and silently produce wrong task SQL. Flat queries only — fail fast at create
  // time so the operator gets an actionable error instead of corrupted task results.
  validateNoNestedSelect(definedSql, viewTableConfig.getTableName());

  // Step 1b: LIMIT is optional. If absent the generator falls back to
  // MaterializedViewTask.DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT and the saturation gate fires at that bound.
  // If present, it must be strictly positive.
  if (pinotQuery.isSetLimit()) {
    validateExplicitLimit(pinotQuery, viewTableConfig.getTableName(), definedSql, context);
  } else {
    // Simulate the generator's auto-injection (trim, semicolon strip, append LIMIT N) and
    // verify the appended LIMIT is parseable. Catches trailing line/block comments in
    // definedSQL that would otherwise swallow the LIMIT only at task-generation time —
    // surfaces the failure at table-create time instead.
    validateAutoInjectedLimitParseable(definedSql, viewTableConfig.getTableName(), context);
  }
  // OFFSET is not meaningful for MV generation (each window is independent) and would
  // also corrupt auto-LIMIT injection — Calcite requires LIMIT before OFFSET, so appending
  // " LIMIT N" after an existing OFFSET produces invalid syntax. Reject up front.
  Preconditions.checkState(!pinotQuery.isSetOffset(),
      "MaterializedViewTask definedSQL must not declare OFFSET for MV table '%s'. SQL: %s",
      viewTableConfig.getTableName(), definedSql);

  // Step 2: source table existence and time-column checks
  String sourceTableName = validateSourceTable(pinotQuery, definedSql, context);

  // Source column existence
  validateSourceColumns(pinotQuery, sourceTableName, context);

  // Step 3: MV schema column completeness (including dateTime columns)
  Set selectFields = validateMaterializedViewColumns(pinotQuery, viewSchema);

  // Step 5: extract and validate time column transformation mappings.
  // We need both the "exprPretty -> materializedViewCol" map (consumed by downstream metadata and by
  // Step 6) and a parallel "materializedViewCol -> sourceExpr" map so Step 7 can locate the actual SELECT
  // expression producing each MV dateTime column without re-walking the SELECT list.
  PartitionExprData partitionExprData = extractPartitionExprData(pinotQuery, viewSchema);
  Map partitionExprMaps = partitionExprData.exprStringToMaterializedViewCol();

  // Step 6: MV time column (segmentsConfig.timeColumnName) must be wired to a SELECT-produced
  // dateTime column. Without this guard, a mismatch (e.g. timeColumnName=ts but SELECT only
  // produces date_trunc('DAY', ts) AS day) would only surface at task scheduling time via the
  // runtime Preconditions in MaterializedViewTaskGenerator#resolveMaterializedViewTimeColumn /
  // resolveMaterializedViewTimeFormat.
  validateMaterializedViewTimeColumnAlignment(viewTableConfig, viewSchema, partitionExprMaps);

  // Step 7: MV time column shape + type. TIMESTAMP-only contract: both base and MV time
  // columns must use DataType.TIMESTAMP, and the SELECT expression producing the MV time
  // column must be either an identity passthrough or DATETRUNC(unit, ts) where unit matches
  // bucketTimePeriod. Step 6 has already ensured the MV time column exists, is a
  // DateTimeFieldSpec, and is in partitionExprMaps, so the lookup below is guaranteed non-null.
  validateMaterializedViewTimeColumn(viewTableConfig, viewSchema, sourceTableName,
      partitionExprData.materializedViewColToSourceExpr(), context, bucketMs);

  // Same three values the checks above produced: source table, SELECT field names, and the
  // "exprPretty -> materializedViewCol" map.
  return new AnalysisResult(sourceTableName, selectFields, partitionExprMaps);
}
+ /// + /// @param sql the SQL query string + /// @return the source table name + /// @throws IllegalStateException if the SQL cannot be parsed or the table name cannot be extracted + public static String extractSourceTableName(String sql) { + PinotQuery pinotQuery = validateSqlSyntax(sql); + DataSource dataSource = pinotQuery.getDataSource(); + Preconditions.checkState(dataSource != null, "Could not extract data source from SQL: %s", sql); + String tableName = dataSource.getTableName(); + Preconditions.checkState(tableName != null && !tableName.isEmpty(), + "Could not extract source table name from SQL: %s", sql); + return tableName; + } + + // --------------------------------------------------------------------------- + // Step 1 — SQL syntax + // --------------------------------------------------------------------------- + + private static PinotQuery validateSqlSyntax(String definedSql) { + Preconditions.checkState(definedSql != null && !definedSql.isEmpty(), "definedSQL must be specified"); + try { + return CalciteSqlParser.compileToPinotQuery(definedSql); + } catch (SqlCompilationException e) { + throw new IllegalStateException("Invalid SQL syntax: " + e.getMessage(), e); + } + } + + /// Rejects MV `definedSQL` that contains a nested `SELECT` / subquery. + /// + /// The scheduler's text-based time-range splicing in + /// `MaterializedViewTaskScheduler#appendTimeRange` attaches the time predicate after the + /// FIRST `WHERE`; a subquery would receive the predicate at the inner query level instead + /// of the outer, producing semantically-wrong task SQL. We use Calcite's `SqlNode` AST + /// walk to count `SqlSelect` nodes; the outer query itself is one `SqlSelect`, so any value + /// greater than 1 indicates a nested SELECT. 
+ private static void validateNoNestedSelect(String definedSql, String viewTableName) { + SqlNode sqlNode; + try { + sqlNode = CalciteSqlParser.compileToSqlNodeAndOptions(definedSql).getSqlNode(); + } catch (SqlCompilationException e) { + throw new IllegalStateException("Invalid SQL syntax: " + e.getMessage(), e); + } + int[] selectCount = new int[]{0}; + sqlNode.accept(new SqlBasicVisitor() { + @Override + public Void visit(SqlCall call) { + if (call instanceof SqlSelect) { + selectCount[0]++; + } + return super.visit(call); + } + }); + Preconditions.checkState(selectCount[0] <= 1, + "MV definedSQL for table '%s' must not contain a nested SELECT / subquery. SQL: %s", + viewTableName, definedSql); + } + + // --------------------------------------------------------------------------- + // Step 1b — LIMIT validation (AST-based) + // --------------------------------------------------------------------------- + + /// Validates a LIMIT clause that is present in the `definedSQL`. + /// + /// When LIMIT is absent the generator falls back to + /// [MaterializedViewTask#DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT] and the executor's saturation gate + /// fires at that bound. When LIMIT is present it must be strictly positive. + /// + /// @throws IllegalStateException if the present LIMIT is non-positive + private static void validateExplicitLimit(PinotQuery pinotQuery, String viewTableName, String definedSql, + MaterializedViewTaskGeneratorContext context) { + int limit = pinotQuery.getLimit(); + Preconditions.checkState(limit > 0, + "MaterializedViewTask definedSQL LIMIT must be strictly positive (got %s) for MV table '%s'. 
SQL: %s", + limit, viewTableName, definedSql); + int maxLimit = MaterializedViewTaskUtils.readPositiveIntClusterConfigOrDefault( + context::getClusterConfig, + MaterializedViewTask.CLUSTER_CONFIG_KEY_MAX_QUERY_LIMIT, + MaterializedViewTask.MAX_MATERIALIZED_VIEW_QUERY_LIMIT); + Preconditions.checkState(limit <= maxLimit, + "MaterializedViewTask definedSQL LIMIT %s exceeds maximum %s for MV table '%s'. " + + "Narrow bucketTimePeriod or filters so per-window row count stays under the cap. SQL: %s", + limit, maxLimit, viewTableName, definedSql); + } + + /// For the no-LIMIT branch, simulate the generator's trailing-text auto-injection (trim + /// trailing semicolon, append " LIMIT N") and verify the LIMIT survives re-parse. Surfaces + /// the common trailing-comment hazard at table-create time instead of letting task generation + /// fail forever in production. + /// + /// Scope: this probe only exercises the trailing-text injection. Other SQL-text hazards + /// (e.g. `appendTimeRange`'s WHERE-clause splice misbehaving on string literals that + /// contain SQL keywords) are caught by the generator's unconditional verify-re-parse just + /// before the task is submitted. 
+ private static void validateAutoInjectedLimitParseable(String definedSql, String viewTableName, + MaterializedViewTaskGeneratorContext context) { + String trimmed = definedSql.trim(); + if (trimmed.endsWith(";")) { + trimmed = trimmed.substring(0, trimmed.length() - 1).trim(); + } + int probeLimit = MaterializedViewTaskUtils.readPositiveIntClusterConfigOrDefault( + context::getClusterConfig, + MaterializedViewTask.CLUSTER_CONFIG_KEY_DEFAULT_QUERY_LIMIT, + MaterializedViewTask.DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT); + String probed = trimmed + " LIMIT " + probeLimit; + Optional verified; + try { + verified = tryExtractDeclaredLimit(probed); + } catch (IllegalStateException e) { + throw new IllegalStateException("MV table '" + viewTableName + "' definedSQL becomes " + + "unparseable when the auto-injected LIMIT is appended (likely a trailing comment " + + "or unbalanced quote). Add an explicit LIMIT to definedSQL or remove the trailing " + + "text. SQL: " + definedSql, e); + } + Preconditions.checkState(verified.isPresent() && verified.get() == probeLimit, + "MV table '%s' definedSQL has trailing text (line/block comment, etc.) that would " + + "swallow the auto-injected LIMIT — broker would silently truncate. Add an " + + "explicit LIMIT to definedSQL or remove the trailing text. SQL: %s", + viewTableName, definedSql); + } + + /// Returns the declared `LIMIT` value from `definedSQL`, or [Optional#empty()] + /// if no LIMIT clause is present. The generator uses the absence sentinel to substitute + /// [MaterializedViewTask#DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT]; the value flows to both the broker + /// SQL (overriding the broker's own default) and the executor's saturation gate. + public static Optional tryExtractDeclaredLimit(String definedSql) { + PinotQuery pinotQuery = validateSqlSyntax(definedSql); + return pinotQuery.isSetLimit() ? 
Optional.of(pinotQuery.getLimit()) : Optional.empty(); + } + + // --------------------------------------------------------------------------- + // Step 2 — Source table + // --------------------------------------------------------------------------- + + private static String validateSourceTable(PinotQuery pinotQuery, String definedSql, + MaterializedViewTaskGeneratorContext context) { + DataSource dataSource = pinotQuery.getDataSource(); + Preconditions.checkState(dataSource != null, "Could not extract data source from SQL: %s", definedSql); + + String sourceTableName = dataSource.getTableName(); + Preconditions.checkState(sourceTableName != null && !sourceTableName.isEmpty(), + "Could not extract source table name from SQL: %s", definedSql); + + String sourceTableWithType = resolveSourceTableWithType(sourceTableName, context); + + TableConfig sourceTableConfig = context.getTableConfig(sourceTableWithType); + + // Reject source-table types whose physical contents are mutable in ways that violate the MV's + // immutable-coverage assumption. Once a time partition is marked VALID, MV results are served + // as the truth for that interval; if the base table can later change rows in that interval + // (upsert / dedup), be entirely replaced (REFRESH push), or be re-broadcast wholesale (dim + // table), the MV will silently disagree with the base. Catching these at create/update time + // keeps the bad config out of cluster metadata altogether. + Preconditions.checkState(!sourceTableConfig.isUpsertEnabled(), + "Source table '%s' has upsert enabled (mode=%s). Materialized views over upsert tables " + + "are not supported: out-of-order or late-arriving updates can modify rows in time " + + "partitions the MV has already marked VALID, leading to silently stale MV results " + + "that the rewriter would still serve.", + sourceTableName, sourceTableConfig.getUpsertMode()); + Preconditions.checkState(!sourceTableConfig.isDedupEnabled(), + "Source table '%s' has dedup enabled. 
Materialized views over dedup tables are not " + + "supported: the deduplicated view is server-managed and not stable across segment " + + "reloads / TTLs, so MV-side aggregates cannot be guaranteed to match the base.", + sourceTableName); + Preconditions.checkState(!sourceTableConfig.isDimTable(), + "Source table '%s' is a dimension table. Materialized views over dimension tables are " + + "not supported: dim tables are fully replaced on every refresh and have no notion " + + "of monotonically advancing time, so the MV's coverage model does not apply.", + sourceTableName); + String pushType = resolveSegmentPushType(sourceTableConfig); + Preconditions.checkState(!"REFRESH".equalsIgnoreCase(pushType), + "Source table '%s' uses REFRESH push type. Materialized views over REFRESH-push tables " + + "are not supported: each push wholesale replaces the base segments, so any time " + + "partition the MV has already marked VALID can be invalidated by the next push.", + sourceTableName); + + // Reject REALTIME source tables: the controller-side notify path that drives STALE marking + // (PinotHelixResourceManager.notifyMaterializedViewConsistencyManager) is wired into the + // OFFLINE upload / completed-segment path only; LLC realtime segment commits go through + // PinotLLCRealtimeSegmentManager which does not currently notify the consistency manager. + // Accepting a realtime source today would silently miss STALE marks, leaving the MV + // permanently VALID over windows whose realtime base has continued to grow. Realtime + // support will land alongside the broker rewrite path in a follow-up PR. The upsert/dedup + // guards above already cover most realtime tables; this check catches the remaining plain + // LLC-realtime cases. + Preconditions.checkState( + !TableNameBuilder.isRealtimeTableResource(sourceTableWithType), + "Source table '%s' is a REALTIME table. 
Materialized views over REALTIME tables are " + + "not yet supported: realtime segment commits do not currently notify the MV " + + "consistency manager, so STALE marking would be silently missed. Use an OFFLINE " + + "source table for now.", + sourceTableName); + + String timeColumn = sourceTableConfig.getValidationConfig().getTimeColumnName(); + Preconditions.checkState(timeColumn != null && !timeColumn.isEmpty(), + "Source table '%s' has no time column configured", sourceTableName); + + Schema sourceSchema = context.getTableSchema(sourceTableWithType); + Preconditions.checkState(sourceSchema != null, "Schema not found for source table: %s", sourceTableName); + + DateTimeFieldSpec fieldSpec = sourceSchema.getSpecForTimeColumn(timeColumn); + Preconditions.checkState(fieldSpec != null, + "No DateTimeFieldSpec found for time column '%s' in source table '%s'", timeColumn, sourceTableName); + + return sourceTableName; + } + + /// Resolves the segment push type from either the modern `IngestionConfig.batchIngestionConfig` + /// location or the legacy `SegmentsValidationAndRetentionConfig.segmentPushType` field. + /// Returns `null` if neither is set, in which case the default behavior (APPEND) is assumed. + /// Both locations must be checked so REFRESH-push tables created with older table configs cannot + /// silently slip past the MV source-type guard. + @SuppressWarnings("deprecation") + private static String resolveSegmentPushType(TableConfig sourceTableConfig) { + if (sourceTableConfig.getIngestionConfig() != null + && sourceTableConfig.getIngestionConfig().getBatchIngestionConfig() != null) { + String type = sourceTableConfig.getIngestionConfig().getBatchIngestionConfig().getSegmentIngestionType(); + if (type != null && !type.isEmpty()) { + return type; + } + } + return sourceTableConfig.getValidationConfig().getSegmentPushType(); + } + + /// Resolves the full table name with type suffix. Tries OFFLINE first, then REALTIME. 
+  private static String resolveSourceTableWithType(String rawSourceTableName,
+      MaterializedViewTaskGeneratorContext context) {
+    String offlineName = TableNameBuilder.OFFLINE.tableNameWithType(rawSourceTableName);
+    if (context.tableExists(offlineName)) {
+      return offlineName;
+    }
+    String realtimeName = TableNameBuilder.REALTIME.tableNameWithType(rawSourceTableName);
+    Preconditions.checkState(context.tableExists(realtimeName),
+        "Source table '%s' does not exist (tried OFFLINE and REALTIME)", rawSourceTableName);
+    return realtimeName;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Source column existence
+  // ---------------------------------------------------------------------------
+
+  /// Verifies that every identifier referenced in the SELECT list and the GROUP BY list exists
+  /// as a column in the source table schema. Other clauses are not inspected here.
+  ///
+  /// @throws IllegalStateException if the schema is missing or a referenced column is unknown
+  private static void validateSourceColumns(PinotQuery pinotQuery, String sourceTableName,
+      MaterializedViewTaskGeneratorContext context) {
+    String sourceTableWithType = resolveSourceTableWithType(sourceTableName, context);
+    Schema sourceSchema = context.getTableSchema(sourceTableWithType);
+    Preconditions.checkState(sourceSchema != null, "Schema not found for source table: %s", sourceTableName);
+
+    Set<String> sourceColumns = new HashSet<>(sourceSchema.getColumnNames());
+    Set<String> referencedIdentifiers = new HashSet<>();
+    for (Expression expr : pinotQuery.getSelectList()) {
+      collectIdentifiers(expr, referencedIdentifiers);
+    }
+    if (pinotQuery.getGroupByList() != null) {
+      for (Expression expr : pinotQuery.getGroupByList()) {
+        collectIdentifiers(expr, referencedIdentifiers);
+      }
+    }
+
+    for (String identifier : referencedIdentifiers) {
+      Preconditions.checkState(sourceColumns.contains(identifier),
+          "Column '%s' referenced in SQL does not exist in source table '%s'. Available columns: %s",
+          identifier, sourceTableName, sourceColumns);
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Step 3 — MV schema columns
+  // ---------------------------------------------------------------------------
+
+  /// Verifies that the SELECT output fields and the MV schema columns are the same set of names
+  /// and that every aggregation in the SELECT list is supported on the MV side.
+  ///
+  /// @return the set of SELECT output field names
+  /// @throws IllegalStateException on an empty SELECT list, a name mismatch in either direction,
+  ///     or an unsupported aggregation
+  private static Set<String> validateMaterializedViewColumns(PinotQuery pinotQuery, Schema viewSchema) {
+    List<Expression> selectList = pinotQuery.getSelectList();
+    Preconditions.checkState(selectList != null && !selectList.isEmpty(), "SELECT list is empty");
+
+    Set<String> selectFields = new HashSet<>();
+    for (Expression expr : selectList) {
+      String fieldName = extractOutputFieldName(expr);
+      selectFields.add(fieldName);
+    }
+
+    // All MV schema columns (including dateTime columns) must be covered by SELECT
+    Set<String> schemaColumns = new HashSet<>(viewSchema.getColumnNames());
+
+    // Check 1: every MV schema column must be covered by a SELECT field
+    for (String col : schemaColumns) {
+      Preconditions.checkState(selectFields.contains(col),
+          "MV schema column '%s' is not produced by any SELECT expression. SELECT fields: %s",
+          col, selectFields);
+    }
+
+    // Check 2: every SELECT field must map to an MV schema column
+    for (String field : selectFields) {
+      Preconditions.checkState(schemaColumns.contains(field),
+          "SELECT field '%s' does not match any column in the MV table schema. Schema columns: %s",
+          field, schemaColumns);
+    }
+
+    // Check 3: aggregation function validity
+    for (Expression expr : selectList) {
+      validateAggregationFunctions(expr);
+    }
+
+    return selectFields;
+  }
+
+  /// Extracts the output field name from a SELECT expression:
+  ///
+  /// - Alias expressions (`expr AS alias`): returns the alias name
+  /// - Bare identifiers (`columnName`): returns the column name
+  /// - Aggregate/function without alias: throws
+  ///
+  /// @throws IllegalStateException if the expression has no usable output name
+  private static String extractOutputFieldName(Expression expr) {
+    Function func = expr.getFunctionCall();
+    if (func != null) {
+      if (func.getOperator().equals("as")) {
+        // Operand 0 is the aliased expression, operand 1 is the alias itself.
+        Expression aliasExpr = func.getOperands().get(1);
+        Preconditions.checkState(aliasExpr.getType() == ExpressionType.IDENTIFIER,
+            "AS alias must be an identifier, got: %s", RequestUtils.prettyPrint(aliasExpr));
+        return aliasExpr.getIdentifier().getName();
+      }
+      throw new IllegalStateException(
+          "Expression '" + RequestUtils.prettyPrint(expr)
+              + "' must have an AS alias to map to an MV schema column");
+    }
+    if (expr.getType() == ExpressionType.IDENTIFIER) {
+      return expr.getIdentifier().getName();
+    }
+    throw new IllegalStateException(
+        "Unsupported expression type in SELECT list: " + RequestUtils.prettyPrint(expr));
+  }
+
+  /// MV-side aggregation functions the broker rewrite engine (lands in PR 2) will know how to
+  /// re-aggregate. Mirrored from the equivalence registry that ships alongside the rewrite
+  /// engine; kept here as a static set so PR 1's analyzer can reject misconfigured MVs at
+  /// create time without dragging the rewrite-engine internals into the ingestion module.
+  /// When PR 2 lands its full {@code AggregationEquivalenceRegistry}, this set MUST stay in
+  /// sync — a function added to the registry but not to this set would be rejected by
+  /// create-time validation even though the rewrite engine could use it, and the operator
+  /// would see a misleading "not supported" config error.
+  private static final Set<String> SUPPORTED_MATERIALIZED_VIEW_AGGREGATIONS = Set.of(
+      "SUM", "MIN", "MAX", "COUNT",
+      "DISTINCTCOUNTRAWHLL", "DISTINCTCOUNTRAWHLLPLUS", "DISTINCTCOUNTRAWTHETASKETCH");
+
+  /// Recursively validates that all aggregation functions used are recognized by Pinot AND
+  /// can be re-aggregated by the broker rewrite engine (PR 2). An MV defined with an
+  /// aggregation that the rewrite engine cannot use would silently never produce rewrites;
+  /// surfacing the rejection at create/update time gives the operator a clear error.
+  private static void validateAggregationFunctions(Expression expr) {
+    Function func = expr.getFunctionCall();
+    if (func == null) {
+      return;
+    }
+    String operator = func.getOperator();
+    if (!operator.equals("as") && AggregationFunctionType.isAggregationFunction(operator)) {
+      // Known aggregation — verify the rewrite engine knows how to re-aggregate it. Without
+      // re-aggregation support, MaterializedViewQueryRewriteEngine (PR 2) would never select
+      // this MV (the strategy returns null on the projection-subsumption check), and the
+      // operator would observe an MV that is configured but never hit.
+      Preconditions.checkState(
+          SUPPORTED_MATERIALIZED_VIEW_AGGREGATIONS.contains(operator.toUpperCase(Locale.ROOT)),
+          "MV definedSQL uses aggregation '%s' for which no MV-side re-aggregation is supported. "
+              + "Supported MV-side aggregations: %s. Replace '%s' with a supported aggregation.",
+          operator, SUPPORTED_MATERIALIZED_VIEW_AGGREGATIONS, operator);
+    } else if (!operator.equals("as") && isLikelyAggregation(func)) {
+      throw new IllegalStateException(
+          "Aggregation function '" + operator + "' is not a recognized Pinot aggregation function");
+    }
+    // Descend into operands so aggregations wrapped by an alias or another function are checked.
+    if (func.getOperands() != null) {
+      for (Expression operand : func.getOperands()) {
+        validateAggregationFunctions(operand);
+      }
+    }
+  }
+
+  /// Heuristic: a function call whose name doesn't match any known scalar/transform and appears
+  /// without a GROUP BY context is likely an unrecognized aggregation. For now we only flag
+  /// functions that CalciteSqlParser itself tagged as aggregation-like but are not in the enum.
+  private static boolean isLikelyAggregation(Function func) {
+    return CalciteSqlParser.isAggregateExpression(wrapAsExpression(func));
+  }
+
+  /// Wraps a [Function] in a FUNCTION-typed [Expression] so it can be passed to APIs that
+  /// take an expression.
+  private static Expression wrapAsExpression(Function func) {
+    Expression expr = new Expression(ExpressionType.FUNCTION);
+    expr.setFunctionCall(func);
+    return expr;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Step 4 — Task config parameters
+  // ---------------------------------------------------------------------------
+
+  /// Validates the MV table type and the MaterializedViewTask config values
+  /// (`bucketTimePeriod`, `bufferTimePeriod`, `maxNumRecordsPerSegment`, `maxTasksPerBatch`).
+  ///
+  /// @return the required `bucketTimePeriod` resolved to milliseconds
+  /// @throws IllegalStateException with a user-facing message on any invalid value
+  private static long validateTaskConfigs(TableConfig viewTableConfig, Map<String, String> taskConfigs,
+      MaterializedViewTaskGeneratorContext context) {
+    Preconditions.checkState(viewTableConfig.getTableType() == TableType.OFFLINE,
+        "MaterializedViewTask only supports OFFLINE tables, got: %s", viewTableConfig.getTableType());
+
+    // bucketTimePeriod is REQUIRED. The consistency manager and broker routing both need an
+    // authoritative bucket size; falling back to an implicit default risks silent drift when the
+    // operator's intent doesn't match the default. Reject MV table creation at the controller
+    // (and the scheduler, since this runs there too) when the field is absent or invalid.
+    String bucketPeriod = taskConfigs.get(MaterializedViewTask.BUCKET_TIME_PERIOD_KEY);
+    Preconditions.checkState(bucketPeriod != null && !bucketPeriod.isEmpty(),
+        "MaterializedViewTask requires '%s' to be set on the MV table's task config",
+        MaterializedViewTask.BUCKET_TIME_PERIOD_KEY);
+    long bucketMs;
+    try {
+      bucketMs = TimeUtils.convertPeriodToMillis(bucketPeriod);
+      Preconditions.checkState(bucketMs > 0, "bucketTimePeriod must be positive, got: %s", bucketPeriod);
+    } catch (Exception e) {
+      throw new IllegalStateException("Invalid bucketTimePeriod '" + bucketPeriod + "': " + e.getMessage(), e);
+    }
+
+    String bufferPeriod = taskConfigs.get(MaterializedViewTask.BUFFER_TIME_PERIOD_KEY);
+    if (bufferPeriod != null && !bufferPeriod.isEmpty()) {
+      long bufferMs;
+      try {
+        bufferMs = TimeUtils.convertPeriodToMillis(bufferPeriod);
+      } catch (Exception e) {
+        throw new IllegalStateException("Invalid bufferTimePeriod '" + bufferPeriod + "': " + e.getMessage(), e);
+      }
+      Preconditions.checkState(bufferMs >= 0,
+          "bufferTimePeriod must be non-negative, got: %s", bufferPeriod);
+    }
+
+    String maxRecords = taskConfigs.get(MaterializedViewTask.MAX_NUM_RECORDS_PER_SEGMENT_KEY);
+    if (maxRecords != null && !maxRecords.isEmpty()) {
+      try {
+        int value = Integer.parseInt(maxRecords);
+        // Guava's Preconditions templates substitute only %s; a %d placeholder would be printed
+        // literally with the value appended in brackets.
+        Preconditions.checkState(value > 0, "maxNumRecordsPerSegment must be positive, got: %s", value);
+      } catch (NumberFormatException e) {
+        throw new IllegalStateException(
+            "Invalid maxNumRecordsPerSegment '" + maxRecords + "': must be a positive integer", e);
+      }
+    }
+
+    String maxTasksPerBatch = taskConfigs.get(MaterializedViewTask.MAX_TASKS_PER_BATCH_KEY);
+    if (maxTasksPerBatch != null && !maxTasksPerBatch.isEmpty()) {
+      try {
+        int value = Integer.parseInt(maxTasksPerBatch);
+        Preconditions.checkState(value >= 1, "maxTasksPerBatch must be >= 1, got: %s", value);
+        int maxTasksPerBatchCap = MaterializedViewTaskUtils.readPositiveIntClusterConfigOrDefault(
+            context::getClusterConfig,
+            MaterializedViewTask.CLUSTER_CONFIG_KEY_MAX_TASKS_PER_BATCH_CAP,
+            MaterializedViewTask.MAX_TASKS_PER_BATCH_USER_CAP);
+        Preconditions.checkState(value <= maxTasksPerBatchCap,
+            "maxTasksPerBatch %s exceeds hard cap %s",
+            value, maxTasksPerBatchCap);
+      } catch (NumberFormatException e) {
+        throw new IllegalStateException(
+            "Invalid maxTasksPerBatch '" + maxTasksPerBatch + "': must be a positive integer", e);
+      }
+    }
+    return bucketMs;
+  }
+
+  // ---------------------------------------------------------------------------
+  // Step 5 — Time column transformation mappings (partitionExprMaps)
+  // ---------------------------------------------------------------------------
+
+  /// Extracts the mapping from base-table time column expressions to MV dateTime column names.
+  ///
+  /// For each dateTime column in the MV schema, this method finds the corresponding SELECT
+  /// expression and records the transformation. The expression is the base-table side (e.g.,
+  /// `dateTimeConvert(ts, '1:MILLISECONDS:EPOCH', '1:DAYS:EPOCH', '1:DAYS')`) and the
+  /// value is the MV column identifier (e.g., `materializedViewDay`).
+  ///
+  /// If the query has a GROUP BY clause, this method also validates that each dateTime
+  /// expression appears in the GROUP BY list.
+  ///
+  /// @return map from expression string to MV column name
+  static Map<String, String> extractPartitionExprMaps(PinotQuery pinotQuery, Schema viewSchema) {
+    return extractPartitionExprData(pinotQuery, viewSchema).exprStringToMaterializedViewCol();
+  }
+
+  /// Internal variant of [#extractPartitionExprMaps(PinotQuery, Schema)] that also
+  /// returns the `materializedViewColName -> sourceExpression` mapping. Step 7 needs the live
+  /// [Expression] (not just the pretty-printed form) so it can run the time-expression
+  /// inferrer.
+  ///
+  /// Both maps are produced in a single SELECT-list walk to avoid double-traversal.
+  static PartitionExprData extractPartitionExprData(PinotQuery pinotQuery, Schema viewSchema) {
+    List<String> dateTimeNamesList = viewSchema.getDateTimeNames();
+    if (dateTimeNamesList.isEmpty()) {
+      return PartitionExprData.EMPTY;
+    }
+
+    Set<String> dateTimeNames = new HashSet<>(dateTimeNamesList);
+    List<Expression> selectList = pinotQuery.getSelectList();
+    Map<String, String> partitionExprMaps = new HashMap<>();
+    Map<String, Expression> materializedViewColToSourceExpr = new HashMap<>();
+
+    for (Expression expr : selectList) {
+      String outputName = extractOutputFieldName(expr);
+      if (!dateTimeNames.contains(outputName)) {
+        continue;
+      }
+      Expression sourceExpr = extractSourceExpression(expr);
+      String exprString = RequestUtils.prettyPrint(sourceExpr);
+
+      // Reject duplicates explicitly. A silent overwrite in either map could make the two maps
+      // disagree, or let a size-based coverage check pass while a dateTime column is uncovered.
+      Expression previousExpr = materializedViewColToSourceExpr.put(outputName, sourceExpr);
+      Preconditions.checkState(previousExpr == null,
+          "MV dateTime column '%s' is produced by more than one SELECT expression ('%s' and '%s'). "
+              + "Each MV dateTime column must be produced exactly once.",
+          outputName, previousExpr == null ? null : RequestUtils.prettyPrint(previousExpr), exprString);
+      String previousCol = partitionExprMaps.put(exprString, outputName);
+      Preconditions.checkState(previousCol == null,
+          "Time column expression '%s' is mapped to more than one MV dateTime column ('%s' and '%s'). "
+              + "Each time expression may produce only one MV dateTime column.",
+          exprString, previousCol, outputName);
+    }
+
+    // Check coverage by column name, not by map size: every dateTime column must have a
+    // producing SELECT expression.
+    Preconditions.checkState(materializedViewColToSourceExpr.keySet().containsAll(dateTimeNames),
+        "Not all MV dateTime columns are covered by SELECT expressions. "
+            + "Expected dateTime columns: %s, found mappings: %s", dateTimeNames, partitionExprMaps);
+
+    // If GROUP BY exists, verify that each dateTime expression is present in GROUP BY
+    List<Expression> groupByList = pinotQuery.getGroupByList();
+    if (groupByList != null && !groupByList.isEmpty()) {
+      Set<String> groupByExprStrings = new HashSet<>();
+      for (Expression gbExpr : groupByList) {
+        groupByExprStrings.add(RequestUtils.prettyPrint(gbExpr));
+      }
+      for (Map.Entry<String, String> entry : partitionExprMaps.entrySet()) {
+        Preconditions.checkState(groupByExprStrings.contains(entry.getKey()),
+            "Time column expression '%s' (mapped to MV column '%s') must appear in GROUP BY "
+                + "when a GROUP BY clause is present. Current GROUP BY: %s",
+            entry.getKey(), entry.getValue(), groupByExprStrings);
+      }
+    }
+
+    return new PartitionExprData(partitionExprMaps, materializedViewColToSourceExpr);
+  }
+
+  /// Extracts the source expression from a SELECT item, stripping any AS alias wrapper.
+ private static Expression extractSourceExpression(Expression expr) { + Function func = expr.getFunctionCall(); + if (func != null && func.getOperator().equals("as")) { + return func.getOperands().get(0); + } + return expr; + } + + /// Convenience overload that parses the SQL and extracts partition expression maps + /// without running full validation. Used by the task generator during cold-start. + public static Map extractPartitionExprMaps(String definedSql, Schema viewSchema) { + PinotQuery pinotQuery = validateSqlSyntax(definedSql); + return extractPartitionExprMaps(pinotQuery, viewSchema); + } + + // --------------------------------------------------------------------------- + // Step 6 — MV time column alignment + // --------------------------------------------------------------------------- + + /// Verifies at create/update time that the MV's `segmentsConfig.timeColumnName` + /// is actually produced by the `definedSql` and is a valid dateTime column in the + /// MV schema. + /// + /// Without this guard, a misconfiguration (e.g. `timeColumnName` inherited from + /// the base table as `ts`, while SELECT only produces + /// `date_trunc('DAY', ts) AS day`) would only surface when the minion schedules a + /// task — the runtime `Preconditions` in + /// [MaterializedViewTaskGenerator]`#resolveMaterializedViewTimeColumn` / + /// `#resolveMaterializedViewTimeFormat` would then throw, failing the task instead of the + /// table configuration. + /// + /// Enforced invariants: + /// + /// - `timeColumnName` is set + /// - It exists in the MV schema + /// - It is registered as a [DateTimeFieldSpec] (not a dimension / metric) + /// - It is produced by some SELECT expression (present in `partitionExprMaps`'s + /// values) — i.e. 
physically present in the MV + /// + /// + /// The stricter "format/granularity also match what the SELECT expression actually + /// produces" check is performed by Step 7 ([#validateMaterializedViewTimeColumnFormat]), which + /// relies on the MV time column already passing invariants (1)–(4) here. + private static void validateMaterializedViewTimeColumnAlignment(TableConfig viewTableConfig, Schema viewSchema, + Map partitionExprMaps) { + String viewTimeColumn = viewTableConfig.getValidationConfig().getTimeColumnName(); + + Preconditions.checkState(viewTimeColumn != null && !viewTimeColumn.isEmpty(), + "MV table segmentsConfig.timeColumnName must be set (required for incremental refresh " + + "and split-mode query rewrite)."); + + Preconditions.checkState(viewSchema.getColumnNames().contains(viewTimeColumn), + "MV time column '%s' does not exist in MV schema. Schema columns: %s", + viewTimeColumn, viewSchema.getColumnNames()); + + DateTimeFieldSpec fieldSpec = viewSchema.getSpecForTimeColumn(viewTimeColumn); + Preconditions.checkState(fieldSpec != null, + "MV time column '%s' is declared in segmentsConfig but is not a dateTime field in the MV " + + "schema. Register it under dateTimeFieldSpecs with an explicit format.", viewTimeColumn); + + Preconditions.checkState(partitionExprMaps.containsValue(viewTimeColumn), + "MV time column '%s' is not produced by any SELECT expression in definedSql. " + + "The MV will not contain this column physically. " + + "Either change segmentsConfig.timeColumnName to one of the time columns the " + + "definedSql produces (candidates: %s), or add a SELECT alias that produces '%s'.", + viewTimeColumn, + partitionExprMaps.values().isEmpty() ? 
"" : partitionExprMaps.values(), + viewTimeColumn); + } + + // --------------------------------------------------------------------------- + // Step 7 — MV time column type + SELECT shape (TIMESTAMP-only) + // --------------------------------------------------------------------------- + + /// Strict TIMESTAMP-only validation of the MV time column. Both the base and MV time columns + /// must use [DataType#TIMESTAMP] (epoch millis), and the SELECT expression producing the MV + /// time column must be either an identity passthrough or `DATETRUNC(, baseTimeCol)` + /// where `` matches `bucketTimePeriod`. Everything else (format inference, + /// `dateTimeConvert`, `toDateTime`, SIMPLE_DATE_FORMAT, non-millis units) is rejected at + /// create time. + /// + /// Preconditions established by Steps 5–6: `materializedViewTimeCol` is non-empty, exists in + /// the MV schema as a [DateTimeFieldSpec], and `materializedViewColToSourceExpr` contains an + /// entry for it. Therefore both lookups below are guaranteed non-null. + private static void validateMaterializedViewTimeColumn(TableConfig viewTableConfig, Schema viewSchema, + String sourceTableName, Map materializedViewColToSourceExpr, + MaterializedViewTaskGeneratorContext context, long bucketMs) { + + String materializedViewTimeCol = viewTableConfig.getValidationConfig().getTimeColumnName(); + Expression sourceExpr = materializedViewColToSourceExpr.get(materializedViewTimeCol); + DateTimeFieldSpec materializedViewFieldSpec = viewSchema.getSpecForTimeColumn(materializedViewTimeCol); + Preconditions.checkState(sourceExpr != null, + "MV time column '%s' is declared in segmentsConfig.timeColumnName but is not produced " + + "by any SELECT expression in definedSQL. Check your SELECT list aliases.", materializedViewTimeCol); + Preconditions.checkState(materializedViewFieldSpec != null, + "MV time column '%s' has no DateTimeFieldSpec in the MV schema. 
" + + "Ensure the schema declares this column as a DateTime field.", materializedViewTimeCol); + + String sourceTableWithType = resolveSourceTableWithType(sourceTableName, context); + TableConfig sourceTableConfig = context.getTableConfig(sourceTableWithType); + String baseTimeColumn = sourceTableConfig.getValidationConfig().getTimeColumnName(); + Schema sourceSchema = context.getTableSchema(sourceTableWithType); + DateTimeFieldSpec baseFieldSpec = sourceSchema.getSpecForTimeColumn(baseTimeColumn); + Preconditions.checkState(baseFieldSpec != null, + "Internal error: base table '%s' time column '%s' resolved to null DateTimeFieldSpec at " + + "format-validation step.", sourceTableName, baseTimeColumn); + + TimeExprValidator.validate(sourceExpr, baseTimeColumn, baseFieldSpec, + materializedViewTimeCol, materializedViewFieldSpec, bucketMs); + } + + /// Step-5 output: both the `exprPretty -> materializedViewCol` map (consumed by downstream + /// metadata + Step 6) and the `materializedViewCol -> sourceExpr` map (consumed by Step 7). 
+ static final class PartitionExprData { + static final PartitionExprData EMPTY = + new PartitionExprData(Collections.emptyMap(), Collections.emptyMap()); + + private final Map _exprStringToMaterializedViewCol; + private final Map _materializedViewColToSourceExpr; + + PartitionExprData(Map exprStringToMaterializedViewCol, + Map materializedViewColToSourceExpr) { + _exprStringToMaterializedViewCol = exprStringToMaterializedViewCol; + _materializedViewColToSourceExpr = materializedViewColToSourceExpr; + } + + Map exprStringToMaterializedViewCol() { + return _exprStringToMaterializedViewCol; + } + + Map materializedViewColToSourceExpr() { + return _materializedViewColToSourceExpr; + } + } + + // --------------------------------------------------------------------------- + // Helpers + // --------------------------------------------------------------------------- + + /// Recursively collects all identifier names referenced in an expression tree, + /// skipping alias names (the right-hand side of AS). + private static void collectIdentifiers(Expression expr, Set identifiers) { + if (expr.getType() == ExpressionType.IDENTIFIER) { + String name = expr.getIdentifier().getName(); + if (!"*".equals(name)) { + identifiers.add(name); + } + return; + } + Function func = expr.getFunctionCall(); + if (func != null && func.getOperands() != null) { + if (func.getOperator().equals("as")) { + // Only collect from the actual expression (first operand), not the alias + collectIdentifiers(func.getOperands().get(0), identifiers); + } else { + for (Expression operand : func.getOperands()) { + collectIdentifiers(operand, identifiers); + } + } + } + } + + // --------------------------------------------------------------------------- + // AnalysisResult + // --------------------------------------------------------------------------- + + /// Holds extracted metadata from a successful analysis. Only returned when all validations pass. 
+ public static class AnalysisResult { + private final String _sourceTableName; + private final Set _selectFields; + private final Map _partitionExprMaps; + + AnalysisResult(String sourceTableName, Set selectFields, + Map partitionExprMaps) { + _sourceTableName = sourceTableName; + _selectFields = selectFields; + _partitionExprMaps = partitionExprMaps; + } + + public String getSourceTableName() { + return _sourceTableName; + } + + public Set getSelectFields() { + return _selectFields; + } + + public Map getPartitionExprMaps() { + return _partitionExprMaps; + } + } +} diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidator.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidator.java new file mode 100644 index 000000000000..3271034ad772 --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidator.java @@ -0,0 +1,286 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.pinot.materializedview.analysis.timeexpr;
+
+import com.google.common.base.Preconditions;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import org.apache.pinot.common.function.FunctionRegistry;
+import org.apache.pinot.common.request.Expression;
+import org.apache.pinot.common.request.ExpressionType;
+import org.apache.pinot.common.request.Function;
+import org.apache.pinot.common.request.Literal;
+import org.apache.pinot.spi.data.DateTimeFieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+
+
+/// Opinionated validator for the MV time column SELECT expression. The MV-side column type is
+/// strict — it MUST be [DataType#TIMESTAMP] (epoch millis stored canonically). The base side
+/// is unconstrained: any base-table dateTime column is acceptable so long as the SELECT
+/// expression produces a millis-epoch value the MV can store as TIMESTAMP.
+///
+/// ### Accepted MV time-column SELECT shapes
+///
+/// - **Identity passthrough**: a bare identifier equal to the base table's primary time
+///   column. Allowed only when the base column is itself TIMESTAMP — otherwise the value
+///   would not be a millis-epoch in the MV.
+/// - **`DATETRUNC('<unit>', baseTimeCol [, MILLISECONDS [, UTC [, MILLISECONDS]]])`** — the
+///   unit literal must match the table's declared `bucketTimePeriod`. Allowed only when
+///   the base column is TIMESTAMP. Optional trailing args, if present, must be the
+///   defaults; non-default values are rejected to keep the semantic surface minimal.
+/// - **Arithmetic scaling**: a chain of multiplications whose only identifier leaf is the
+///   base time column, with the remaining operands being positive integer literals. The
+///   chain may be nested (`base * 24 * 60 * 60 * 1000`) or flat (`base * 86400000`). This
+///   is the recommended form when the base column stores a coarser unit (e.g. days, seconds)
+///   and the user wants the MV to hold millis-epoch values. The validator does NOT verify
+///   the multiplied constants actually convert the base unit to millis — that responsibility
+///   sits with the user, but the MV column being TIMESTAMP gives a clear contract.
+///
+/// Anything else (DATETIMECONVERT, TODATETIME, mixed addition/subtraction, nested DATETRUNC,
+/// non-TIMESTAMP MV column type) is rejected at table-create time with an actionable message.
+public final class TimeExprValidator {
+
+  /// `DATETRUNC` units that map to a fixed-size [TimeUnit]. WEEK/MONTH/QUARTER/YEAR are
+  /// intentionally absent because their bucket size is calendar-dependent (28/29/30/31 days,
+  /// etc.) and cannot be reconciled with the millis-based `bucketTimePeriod` config.
+  private static final Map<String, TimeUnit> DATETRUNC_UNIT_TO_TIMEUNIT = Map.ofEntries(
+      Map.entry("MILLISECOND", TimeUnit.MILLISECONDS),
+      Map.entry("SECOND", TimeUnit.SECONDS),
+      Map.entry("MINUTE", TimeUnit.MINUTES),
+      Map.entry("HOUR", TimeUnit.HOURS),
+      Map.entry("DAY", TimeUnit.DAYS));
+
+  private static final String DATETRUNC_CANONICAL = "datetrunc";
+  private static final String TIMES_CANONICAL = "times";
+  private static final String DEFAULT_TIME_UNIT = TimeUnit.MILLISECONDS.name();
+  private static final String DEFAULT_TIMEZONE = "UTC";
+
+  private TimeExprValidator() {
+  }
+
+  /// Validates the MV time-column SELECT expression and the data types on both sides.
+  ///
+  /// @param sourceExpr MV SELECT expression producing the MV time column (alias stripped)
+  /// @param baseTimeColName name of the base table's primary time column
+  /// @param baseTimeFieldSpec base table's [DateTimeFieldSpec] for that column
+  /// @param viewTimeColName MV table's primary time column name (for error messages)
+  /// @param viewTimeFieldSpec MV table's [DateTimeFieldSpec] for that column
+  /// @param bucketMs the table's declared `bucketTimePeriod` resolved to millis
+  /// @throws NullPointerException if any reference argument is null
+  /// @throws IllegalStateException with a user-facing message on any violation
+  public static void validate(Expression sourceExpr, String baseTimeColName,
+      DateTimeFieldSpec baseTimeFieldSpec, String viewTimeColName,
+      DateTimeFieldSpec viewTimeFieldSpec, long bucketMs) {
+    Preconditions.checkNotNull(sourceExpr, "sourceExpr");
+    Preconditions.checkNotNull(baseTimeColName, "baseTimeColName");
+    Preconditions.checkNotNull(baseTimeFieldSpec, "baseTimeFieldSpec");
+    Preconditions.checkNotNull(viewTimeColName, "viewTimeColName");
+    Preconditions.checkNotNull(viewTimeFieldSpec, "viewTimeFieldSpec");
+
+    // The MV side is checked first: no SELECT shape is acceptable unless the MV column is TIMESTAMP.
+    Preconditions.checkState(viewTimeFieldSpec.getDataType() == DataType.TIMESTAMP,
+        "MV requires TIMESTAMP-typed time columns. MV time column '%s' has data type %s. "
+            + "Declare the column with dataType=TIMESTAMP and format='1:MILLISECONDS:TIMESTAMP'.",
+        viewTimeColName, viewTimeFieldSpec.getDataType());
+
+    ExpressionType exprType = sourceExpr.getType();
+    if (exprType == ExpressionType.IDENTIFIER) {
+      String actual = requireIdentifierName(sourceExpr);
+      Preconditions.checkState(baseTimeColName.equals(actual),
+          "MV time column must derive from base time column '%s' (identity, DATETRUNC, or "
+              + "arithmetic scaling), got identifier '%s'.",
+          baseTimeColName, actual);
+      Preconditions.checkState(baseTimeFieldSpec.getDataType() == DataType.TIMESTAMP,
+          "Identity passthrough requires the base time column '%s' to be TIMESTAMP-typed. "
+              + "Base column data type is %s; use arithmetic scaling (e.g. '%s * 86400000') to "
+              + "convert into millis-epoch.",
+          baseTimeColName, baseTimeFieldSpec.getDataType(), baseTimeColName);
+      return;
+    }
+
+    if (exprType == ExpressionType.FUNCTION) {
+      Function func = requireFunctionCall(sourceExpr);
+      String canonical = FunctionRegistry.canonicalize(func.getOperator());
+      if (DATETRUNC_CANONICAL.equals(canonical)) {
+        Preconditions.checkState(baseTimeFieldSpec.getDataType() == DataType.TIMESTAMP,
+            "DATETRUNC on the MV time column requires the base time column '%s' to be "
+                + "TIMESTAMP-typed. Base column data type is %s; use arithmetic scaling instead.",
+            baseTimeColName, baseTimeFieldSpec.getDataType());
+        validateDateTrunc(func, baseTimeColName, bucketMs);
+        return;
+      }
+      if (TIMES_CANONICAL.equals(canonical)) {
+        validateArithmeticScaling(func, baseTimeColName);
+        return;
+      }
+      throw new IllegalStateException(
+          "MV time column expression uses unsupported function '" + func.getOperator()
+              + "'. The MV feature accepts only identity passthrough, DATETRUNC, or arithmetic "
+              + "scaling (multiplication of the base time column by integer constants).");
+    }
+
+    throw new IllegalStateException(
+        "MV time column expression must be either the base time column '" + baseTimeColName
+            + "' (identity), a DATETRUNC call, or arithmetic scaling. Got expression type: " + exprType);
+  }
+
+  /// Checks a `DATETRUNC` call: 2 to 5 arguments, a string-literal unit, the base time column as
+  /// a bare identifier, default-valued optional arguments, and a unit whose fixed width equals
+  /// `bucketMs`.
+  private static void validateDateTrunc(Function func, String baseTimeColName, long bucketMs) {
+    List<Expression> operands = func.getOperands();
+    // A missing operand list is reported as "0 arguments" rather than surfacing as an NPE.
+    // NOTE(review): presumably the generated accessor returns null when operands were never set.
+    int operandsSize = operands == null ? 0 : operands.size();
+    Preconditions.checkState(operandsSize >= 2 && operandsSize <= 5,
+        "DATETRUNC must be called with 2 to 5 arguments, got %s.", operandsSize);
+
+    String unit = requireStringLiteral(operands.get(0), "unit");
+    requireIdentifier(operands.get(1), baseTimeColName,
+        "DATETRUNC second argument must be the base time column");
+
+    if (operandsSize >= 3) {
+      String inputTimeUnit = requireStringLiteral(operands.get(2), "inputTimeUnit");
+      Preconditions.checkState(DEFAULT_TIME_UNIT.equalsIgnoreCase(inputTimeUnit),
+          "DATETRUNC inputTimeUnit must be %s (base column is TIMESTAMP / millis), got '%s'.",
+          DEFAULT_TIME_UNIT, inputTimeUnit);
+    }
+    if (operandsSize >= 4) {
+      String timeZone = requireStringLiteral(operands.get(3), "timeZone");
+      Preconditions.checkState(DEFAULT_TIMEZONE.equalsIgnoreCase(timeZone),
+          "DATETRUNC timeZone must be %s, got '%s'. Non-UTC truncation would shift bucket boundaries "
+              + "relative to the bucketTimePeriod and is rejected.",
+          DEFAULT_TIMEZONE, timeZone);
+    }
+    if (operandsSize == 5) {
+      String outputTimeUnit = requireStringLiteral(operands.get(4), "outputTimeUnit");
+      Preconditions.checkState(DEFAULT_TIME_UNIT.equalsIgnoreCase(outputTimeUnit),
+          "DATETRUNC outputTimeUnit must be %s (MV column is TIMESTAMP / millis), got '%s'.",
+          DEFAULT_TIME_UNIT, outputTimeUnit);
+    }
+
+    // Unit lookup is case-insensitive; Locale.ROOT keeps the upper-casing locale-independent.
+    String unitUpper = unit.toUpperCase(Locale.ROOT);
+    TimeUnit truncTimeUnit = DATETRUNC_UNIT_TO_TIMEUNIT.get(unitUpper);
+    Preconditions.checkState(truncTimeUnit != null,
+        "DATETRUNC unit '%s' is not supported by MV. Supported units: %s. WEEK/MONTH/QUARTER/YEAR "
+            + "have calendar-variable bucket sizes incompatible with bucketTimePeriod.",
+        unit, DATETRUNC_UNIT_TO_TIMEUNIT.keySet());
+    long truncBucketMs = truncTimeUnit.toMillis(1);
+    Preconditions.checkState(truncBucketMs == bucketMs,
+        "DATETRUNC unit '%s' (%s ms) does not match the declared bucketTimePeriod (%s ms). "
+            + "Either change DATETRUNC to a matching unit or update bucketTimePeriod.",
+        unit, truncBucketMs, bucketMs);
+  }
+
+  /// Walks a (possibly nested) chain of `times(...)` calls and verifies that exactly one leaf is
+  /// the base time column and every other leaf is a positive integer literal. Computes the
+  /// effective scale factor as the product of all literal operands and rejects overflow.
+  private static void validateArithmeticScaling(Function root, String baseTimeColName) {
+    // Single-element arrays act as mutable accumulators shared across the recursive walk.
+    long[] scaleHolder = {1L};
+    boolean[] sawBase = {false};
+    walkMultiplicationChain(asExpression(root), baseTimeColName, scaleHolder, sawBase);
+    Preconditions.checkState(sawBase[0],
+        "Arithmetic scaling for the MV time column must reference the base time column '%s' "
+            + "exactly once as one of the multiplicands.", baseTimeColName);
+    Preconditions.checkState(scaleHolder[0] > 0,
+        "Arithmetic scaling for the MV time column must produce a positive scale factor; "
+            + "got %s.", scaleHolder[0]);
+  }
+
+  /// Wraps a function call in a FUNCTION-typed [Expression] so the walker can treat the root
+  /// like any other node.
+  private static Expression asExpression(Function func) {
+    Expression expr = new Expression(ExpressionType.FUNCTION);
+    expr.setFunctionCall(func);
+    return expr;
+  }
+
+  /// Recursive step of the multiplication-chain walk. Identifier leaves must be the base time
+  /// column (at most once), literal leaves must be positive integers and are folded into
+  /// `scaleHolder[0]`, and function nodes must be `times` calls whose operands are walked in turn.
+  private static void walkMultiplicationChain(Expression node, String baseTimeColName,
+      long[] scaleHolder, boolean[] sawBase) {
+    ExpressionType type = node.getType();
+    if (type == ExpressionType.IDENTIFIER) {
+      String name = requireIdentifierName(node);
+      Preconditions.checkState(baseTimeColName.equals(name),
+          "Arithmetic scaling for the MV time column may only reference the base time column "
+              + "'%s'; got identifier '%s'.",
+          baseTimeColName, name);
+      Preconditions.checkState(!sawBase[0],
+          "Arithmetic scaling for the MV time column must reference the base time column '%s' "
+              + "exactly once; saw it more than once.",
+          baseTimeColName);
+      sawBase[0] = true;
+      return;
+    }
+    if (type == ExpressionType.LITERAL) {
+      long literal = requirePositiveIntegerLiteral(node);
+      // Multiplicative overflow guard — bucketTimePeriod*days*hours*... must stay positive.
+      // `literal` is > 0 here, so the division is safe.
+      Preconditions.checkState(scaleHolder[0] <= Long.MAX_VALUE / literal,
+          "Arithmetic scaling for the MV time column overflows; literal operand %s pushes the "
+              + "running product past Long.MAX_VALUE.", literal);
+      scaleHolder[0] *= literal;
+      return;
+    }
+    Preconditions.checkState(type == ExpressionType.FUNCTION,
+        "Arithmetic scaling for the MV time column accepts only identifier or literal operands "
+            + "at the leaves, got expression type: %s",
+        type);
+    Function func = requireFunctionCall(node);
+    String canonical = FunctionRegistry.canonicalize(func.getOperator());
+    Preconditions.checkState(TIMES_CANONICAL.equals(canonical),
+        "Arithmetic scaling for the MV time column accepts only chained multiplication; got '%s'. "
+            + "Use a single `times` chain such as `%s * 86400000`.",
+        func.getOperator(), baseTimeColName);
+    List<Expression> operands = func.getOperands();
+    if (operands == null) {
+      // Nothing to walk; the caller's "base column seen" check reports the violation.
+      return;
+    }
+    for (Expression operand : operands) {
+      walkMultiplicationChain(operand, baseTimeColName, scaleHolder, sawBase);
+    }
+  }
+
+  /// Returns the identifier name of an IDENTIFIER-typed expression, rejecting a missing payload
+  /// with the validator's own exception type instead of an NPE.
+  private static String requireIdentifierName(Expression expr) {
+    Preconditions.checkState(expr.getIdentifier() != null,
+        "MV time column expression is an IDENTIFIER but has no identifier payload");
+    return expr.getIdentifier().getName();
+  }
+
+  /// Returns the function call of a FUNCTION-typed expression, rejecting a missing payload.
+  private static Function requireFunctionCall(Expression expr) {
+    Function func = expr.getFunctionCall();
+    Preconditions.checkState(func != null,
+        "MV time column expression is a FUNCTION but has no function call payload");
+    return func;
+  }
+
+  /// Extracts a strictly positive integer from a literal expression. Only long- and int-valued
+  /// literals are accepted; any other literal kind is rejected.
+  private static long requirePositiveIntegerLiteral(Expression expr) {
+    Literal literal = expr.getLiteral();
+    Preconditions.checkState(literal != null,
+        "Arithmetic scaling for the MV time column requires positive integer literal operands.");
+    long value;
+    if (literal.isSetLongValue()) {
+      value = literal.getLongValue();
+    } else if (literal.isSetIntValue()) {
+      value = literal.getIntValue();
+    } else {
+      throw new IllegalStateException(
+          "Arithmetic scaling for the MV time column requires positive integer literal operands; "
+              + "non-integer literal: " + literal);
+    }
+    Preconditions.checkState(value > 0,
+        "Arithmetic scaling for the MV time column requires positive integer literal operands; "
+            + "got %s.", value);
+    return value;
+  }
+
+  /// Returns the string value of a `DATETRUNC` argument, rejecting anything that is not a string
+  /// literal. `argName` is used only in the error message.
+  private static String requireStringLiteral(Expression expr, String argName) {
+    Preconditions.checkState(expr.getType() == ExpressionType.LITERAL,
+        "DATETRUNC argument '%s' must be a string literal.", argName);
+    Literal literal = expr.getLiteral();
+    Preconditions.checkState(literal != null && literal.isSetStringValue(),
+        "DATETRUNC argument '%s' must be a string literal.", argName);
+    return literal.getStringValue();
+  }
+
+  /// Requires `expr` to be a bare identifier naming the base time column. `contextMessage`
+  /// prefixes the error message.
+  private static void requireIdentifier(Expression expr, String baseTimeColName, String contextMessage) {
+    Preconditions.checkState(expr.getType() == ExpressionType.IDENTIFIER,
+        "%s '%s' as a bare identifier, not a nested expression.",
+        contextMessage, baseTimeColName);
+    String actual = requireIdentifierName(expr);
+    Preconditions.checkState(baseTimeColName.equals(actual),
+        "%s '%s', got '%s'.", contextMessage, baseTimeColName, actual);
+  }
+}
diff --git
a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManager.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManager.java new file mode 100644 index 000000000000..2da6a414236e --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManager.java @@ -0,0 +1,598 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.materializedview.consistency; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; +import org.apache.helix.AccessOption; +import org.apache.helix.store.zk.ZkHelixPropertyStore; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.helix.zookeeper.zkclient.IZkChildListener; +import org.apache.helix.zookeeper.zkclient.IZkDataListener; +import org.apache.pinot.common.metadata.ZKMetadataProvider; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadata; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadata; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadataUtils; +import org.apache.pinot.materializedview.metadata.PartitionInfo; +import org.apache.pinot.materializedview.metadata.PartitionState; +import org.apache.pinot.materializedview.scheduler.MaterializedViewTaskUtils; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableTaskConfig; +import org.apache.pinot.spi.utils.CommonConstants; +import org.apache.pinot.spi.utils.TimeUtils; +import org.apache.pinot.spi.utils.builder.TableNameBuilder; +import org.apache.zookeeper.data.Stat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/// Manages MV partition consistency by reacting to base table segment changes. 
+/// +/// When segments are added, replaced, or deleted in a base table, this manager +/// identifies which MV partitions overlap with the changed time range and marks +/// them as [PartitionState#STALE] in [MaterializedViewRuntimeMetadata]. +/// +/// Events are accumulated per base table using a debounce window ([#DEFAULT_DEBOUNCE_DELAY_MS]ms). +/// Multiple segment changes within the window are merged into a single time range and +/// processed as one ZK read-modify-write operation, avoiding excessive ZK traffic during +/// batch ingestion or bulk segment operations. +/// +/// Thread-safety: all public methods are thread-safe. The internal flush runs on a +/// single-threaded scheduler to serialize ZK writes per base table. +/// +///

Partition model (TIME-WINDOWED ONLY in PR 1)

+/// +/// This implementation assumes a single MV partition shape: time-windowed partitions of +/// uniform width `bucketTimePeriod`, keyed by `bucketStartMs`. Range-based notifications +/// (`onBaseTableDataChange(table, startMs, endMs)`) map directly to a contiguous bucket +/// sweep. Future fixed-partition (categorical) MVs will require: +/// +/// - A `PartitionKind` discriminator on [MaterializedViewDefinitionMetadata]. +/// - A non-`Long`-keyed partition map (or a `PartitionKey` abstraction) on +/// [MaterializedViewRuntimeMetadata]. +/// - A separate STALE-marking path that does not iterate buckets by time. +/// +/// See `pinot-materialized-view/DESIGN.md` for the migration plan. Callers that cannot +/// supply a tight time range (or that operate on fixed-partition MVs) should use +/// [#onBaseTableFullInvalidation] rather than passing sentinel time values — that entry +/// point will dispatch on `PartitionKind` once fixed-partition MVs land. +public class MaterializedViewConsistencyManager { + private static final Logger LOGGER = LoggerFactory.getLogger(MaterializedViewConsistencyManager.class); + + /// Compile-time default debounce window for the consistency manager. Overridable per cluster + /// (no restart) via `MaterializedViewTask.CLUSTER_CONFIG_KEY_CONSISTENCY_DEBOUNCE_MS`. + static final long DEFAULT_DEBOUNCE_DELAY_MS = 5000; + private static final String MATERIALIZED_VIEW_DEFINITION_PARENT_PATH = + ZKMetadataProvider.getPropertyStorePathForMaterializedViewDefinitionPrefix(); + private static final String MATERIALIZED_VIEW_DEFINITION_PATH_PREFIX = + MATERIALIZED_VIEW_DEFINITION_PARENT_PATH + "/"; + /// CAS retry budget for STALE-marking writes on the runtime znode. Sized to match the executor's + /// `MAX_RUNTIME_UPDATE_ATTEMPTS` so a STALE marking is never silently dropped in favor of + /// an executor's coverage advance under contention from up to `maxTasksPerBatch` (default 4, + /// cap 1000) parallel completions. 
With ~5–25 ms jittered backoff per retry, 128 attempts cap
+  /// total backoff wait near 3 s.
+  private static final int MAX_MARK_RETRIES = 128;
+
+  /// Reverse index: rawBaseTableName → list of viewTableNameWithType.
+  private final ConcurrentHashMap<String, List<String>> _baseTableToMaterializedViewTables =
+      new ConcurrentHashMap<>();
+  /// Guards compound updates of the reverse index.
+  private final Object _reverseIndexLock = new Object();
+  private final DefinitionChangeListener _definitionChangeListener = new DefinitionChangeListener();
+  private final Set<String> _subscribedDefinitionPaths = ConcurrentHashMap.newKeySet();
+
+  /// Debounce buffer: rawBaseTableName → [minAffectedStartMs, maxAffectedEndMs].
+  private final ConcurrentHashMap<String, long[]> _pendingRanges = new ConcurrentHashMap<>();
+
+  /// Per-base-table scheduled flush future for debounce cancellation/reset.
+  private final ConcurrentHashMap<String, ScheduledFuture<?>> _pendingTimers = new ConcurrentHashMap<>();
+
+  private final ScheduledExecutorService _scheduler;
+  private volatile ZkHelixPropertyStore<ZNRecord> _propertyStore;
+  /// Optional live cluster-config reader; controller wires this so debounce / retry caps can be
+  /// overridden at runtime without restart. Null in unit tests; null lookup falls back to the
+  /// compile-time defaults.
+  /// NOTE(review): type arguments reconstructed as config key → raw string value — confirm
+  /// against `MaterializedViewTaskUtils.readPositiveLongClusterConfigOrDefault`.
+  private volatile Function<String, String> _clusterConfigReader;
+
+  /// Creates the manager with a single daemon scheduler thread named
+  /// `materialized-view-consistency-manager`.
+  public MaterializedViewConsistencyManager() {
+    _scheduler = Executors.newSingleThreadScheduledExecutor(r -> {
+      Thread t = new Thread(r, "materialized-view-consistency-manager");
+      t.setDaemon(true);
+      return t;
+    });
+  }
+
+  /// Wires the live cluster-config reader. Safe to call any time; nulls reset to default behavior.
+  public void setClusterConfigReader(Function<String, String> clusterConfigReader) {
+    _clusterConfigReader = clusterConfigReader;
+  }
+
+  /// Initializes the manager by scanning existing MV definition metadata to build the reverse index.
+  /// Must be called after the PropertyStore is ready (after Controller startup).
+  public void init(ZkHelixPropertyStore<ZNRecord> propertyStore) {
+    _propertyStore = propertyStore;
+    // Subscribe before the initial scan so definition changes that land during the scan are seen.
+    _propertyStore.subscribeChildChanges(MATERIALIZED_VIEW_DEFINITION_PARENT_PATH, _definitionChangeListener);
+    syncDefinitionDataSubscriptions(null);
+    rebuildReverseIndex();
+    LOGGER.info("MaterializedViewConsistencyManager initialized with {} base table mappings",
+        _baseTableToMaterializedViewTables.size());
+  }
+
+  /// Shuts down the scheduler. Pending flushes are discarded.
+  public void stop() {
+    if (_propertyStore != null) {
+      _propertyStore.unsubscribeChildChanges(MATERIALIZED_VIEW_DEFINITION_PARENT_PATH, _definitionChangeListener);
+      // Iterate over a snapshot; the live set is cleared right after.
+      for (String path : new ArrayList<>(_subscribedDefinitionPaths)) {
+        _propertyStore.unsubscribeDataChanges(path, _definitionChangeListener);
+      }
+      _subscribedDefinitionPaths.clear();
+    }
+    _scheduler.shutdownNow();
+    _pendingRanges.clear();
+    _pendingTimers.clear();
+    LOGGER.info("MaterializedViewConsistencyManager stopped");
+  }
+
+  /// Notifies that segments in a base table have changed. The affected time range is
+  /// accumulated in a debounce buffer; multiple calls within [#DEFAULT_DEBOUNCE_DELAY_MS]ms
+  /// for the same base table are merged into a single flush.
+  ///
+  /// This method is O(1). When no MV depends on the base table it is a lock-free map lookup;
+  /// otherwise it performs a map merge + timer reset while holding `_reverseIndexLock`.
+  /// Calls that arrive after [#stop] are dropped instead of throwing.
+  ///
+  /// @param baseTableName raw table name without type suffix
+  /// @param affectedStartMs earliest startTimeMs among changed segments
+  /// @param affectedEndMs latest endTimeMs among changed segments
+  public void onBaseTableDataChange(String baseTableName, long affectedStartMs, long affectedEndMs) {
+    if (affectedStartMs < 0 || affectedEndMs < 0) {
+      LOGGER.debug("Skipping MV dirty marking for table {} with invalid time range [{}, {}]",
+          baseTableName, affectedStartMs, affectedEndMs);
+      return;
+    }
+    // Lock-free fast-path: skip the lock and merge entirely when no MV references this base
+    // table. Every controller-side ZK segment write now flows through this method (see
+    // `PinotHelixResourceManager.createSegmentZkMetadata`), so on clusters with zero MVs the
+    // common case must not contend for `_reverseIndexLock`. ConcurrentHashMap.containsKey is
+    // safe under concurrent registration / removal; the re-check inside the lock below
+    // preserves correctness against the race with `onMaterializedViewTableDropped`.
+    if (!_baseTableToMaterializedViewTables.containsKey(baseTableName)) {
+      return;
+    }
+    // Hold _reverseIndexLock across the membership re-check, range merge, and timer reset so
+    // a concurrent onMaterializedViewTableDropped cannot race in between the check and the
+    // merge — without this guard, a dropped MV's base table could leave a pending range
+    // buffered that the flush eventually no-ops on but still consumes scheduler time
+    // proportional to the merge rate.
+    synchronized (_reverseIndexLock) {
+      if (!_baseTableToMaterializedViewTables.containsKey(baseTableName)) {
+        return;
+      }
+
+      // Widen the buffered range in place: min of starts, max of ends.
+      _pendingRanges.merge(baseTableName, new long[]{affectedStartMs, affectedEndMs},
+          (existing, incoming) -> {
+            existing[0] = Math.min(existing[0], incoming[0]);
+            existing[1] = Math.max(existing[1], incoming[1]);
+            return existing;
+          });
+
+      // Debounce: cancel the previously scheduled flush (if any) and schedule a fresh one.
+      _pendingTimers.compute(baseTableName, (key, prev) -> {
+        if (prev != null) {
+          prev.cancel(false);
+        }
+        long debounceMs = MaterializedViewTaskUtils.readPositiveLongClusterConfigOrDefault(
+            _clusterConfigReader,
+            CommonConstants.MaterializedViewTask.CLUSTER_CONFIG_KEY_CONSISTENCY_DEBOUNCE_MS,
+            DEFAULT_DEBOUNCE_DELAY_MS);
+        try {
+          return _scheduler.schedule(() -> flush(baseTableName), debounceMs, TimeUnit.MILLISECONDS);
+        } catch (java.util.concurrent.RejectedExecutionException e) {
+          // stop() has already shut the scheduler down. Pending flushes are discarded on stop, so
+          // drop this range too rather than letting the rejection escape into the caller.
+          _pendingRanges.remove(baseTableName);
+          LOGGER.debug("Dropping MV dirty marking for table {}: consistency manager is stopped",
+              baseTableName);
+          return null;
+        }
+      });
+    }
+  }
+
+  /// Convenience entry point that marks every covered partition of every dependent MV STALE
+  /// without requiring the caller to know which time range or partition key was affected.
+  ///
+  /// Used in two situations today:
+  ///
+  /// - Time-windowed MVs whose base segments lack `startTimeMs` / `endTimeMs` metadata, so the
+  ///   controller cannot compute a tight range. Conservative full-invalidation prevents leaking
+  ///   stale VALID partitions; the consistency manager's per-MV watermark cap (`markPartitionsDirty`)
+  ///   bounds the iteration cost so this remains cheap even on long-history MVs.
+  /// - Future fixed-partition MVs (see `DESIGN.md`), where the controller cannot in general
+  ///   determine which categorical partition a base segment touched without inspecting its
+  ///   data; full invalidation is the only honest signal at the consistency-manager layer.
+  ///
+  /// Implemented internally as a `[0, Long.MAX_VALUE)` invalidation routed through the same
+  /// debounce + watermark-capped iteration path as range invalidations, so the on-disk effect
+  /// (which partitions become STALE) is identical to an explicit "everything covered" range
+  /// for time MVs. Callers should prefer this method over passing `Long.MAX_VALUE` directly to
+  /// `onBaseTableDataChange` — the typed entry point keeps the intent visible at the call site
+  /// and lets the implementation evolve (e.g. specialized fixed-partition path) without
+  /// touching every caller.
+  public void onBaseTableFullInvalidation(String baseTableName) {
+    onBaseTableDataChange(baseTableName, 0L, Long.MAX_VALUE);
+  }
+
+  /// Returns the list of materialized-view tables (with `_OFFLINE` / `_REALTIME` suffix) that
+  /// depend on `rawBaseTableName`. Returns an empty list when no MV references this base.
+  ///
+  /// Used by the controller to block base-table deletion when dependent MVs exist — operator
+  /// must drop the MV(s) first to surface the dependency rather than orphan them.
+  public List<String> getDependentMaterializedViews(String rawBaseTableName) {
+    List<String> dependents = _baseTableToMaterializedViewTables.get(rawBaseTableName);
+    if (dependents == null || dependents.isEmpty()) {
+      return Collections.emptyList();
+    }
+    // Defensive copy so a caller doesn't accidentally mutate the live reverse index.
+    return new ArrayList<>(dependents);
+  }
+
+  /// Updates the reverse index when a new MV table is created. Registering the same MV twice
+  /// for a base table is a no-op.
+  ///
+  /// @param viewTableNameWithType MV table name including the type suffix
+  /// @param baseTables raw names of the base tables the MV depends on
+  public void onMaterializedViewTableCreated(String viewTableNameWithType, List<String> baseTables) {
+    synchronized (_reverseIndexLock) {
+      for (String baseTable : baseTables) {
+        List<String> materializedViewTables =
+            _baseTableToMaterializedViewTables.computeIfAbsent(baseTable, k -> new CopyOnWriteArrayList<>());
+        // The contains-then-add pair is atomic because every writer holds _reverseIndexLock.
+        if (!materializedViewTables.contains(viewTableNameWithType)) {
+          materializedViewTables.add(viewTableNameWithType);
+        }
+      }
+    }
+    LOGGER.info("Registered MV table {} with base tables {}", viewTableNameWithType, baseTables);
+  }
+
+  /// Updates the reverse index when an MV table is dropped. A base table whose last dependent
+  /// MV is removed loses its reverse-index entry entirely.
+  ///
+  /// @param viewTableNameWithType MV table name including the type suffix
+  /// @param baseTables raw names of the base tables the MV depended on
+  public void onMaterializedViewTableDropped(String viewTableNameWithType, List<String> baseTables) {
+    synchronized (_reverseIndexLock) {
+      for (String baseTable : baseTables) {
+        List<String> materializedViewTables = _baseTableToMaterializedViewTables.get(baseTable);
+        if (materializedViewTables != null) {
+          materializedViewTables.remove(viewTableNameWithType);
+          if (materializedViewTables.isEmpty()) {
+            _baseTableToMaterializedViewTables.remove(baseTable);
+          }
+        }
+      }
+    }
+    LOGGER.info("Unregistered MV table {} from base tables {}", viewTableNameWithType, baseTables);
+  }
+
+  /// Processes all accumulated changes for a base table in one batch.
+  /// For each affected MV, performs one ZK read + one ZK write on the runtime ZNode.
+ @VisibleForTesting + void flush(String baseTableName) { + // Atomic snapshot-and-clear: drain `_pendingRanges` and `_pendingTimers` inside a single + // `_pendingTimers.compute` lambda so a concurrent `onBaseTableDataChange` cannot land between + // the two `remove()` calls. Without this, a debounce reschedule between the removes can drop + // a pending range (the new future is removed from the map but the buffer entry it would have + // flushed stays orphaned until the next event arrives). + long[] range = drainPendingForFlush(baseTableName); + if (range == null) { + return; + } + + List materializedViewTables; + synchronized (_reverseIndexLock) { + materializedViewTables = _baseTableToMaterializedViewTables.get(baseTableName); + if (materializedViewTables == null || materializedViewTables.isEmpty()) { + return; + } + materializedViewTables = new ArrayList<>(materializedViewTables); + } + + LOGGER.info("Flushing MV dirty marking for base table: {}, affected range: [{}, {}], " + + "affected MV tables: {}", baseTableName, range[0], range[1], materializedViewTables); + + for (String viewTableName : materializedViewTables) { + markPartitionsDirtyWithRetry(viewTableName, range[0], range[1]); + } + } + + /// Atomically removes both the pending range and the pending timer entry for a base table. + /// Returns the range to flush, or `null` if there is nothing pending. The single + /// `_pendingTimers.compute` lambda ensures `onBaseTableDataChange` cannot interleave + /// between draining the buffer and clearing the timer — which would otherwise let a freshly + /// scheduled timer be evicted from the map while its buffered range stays orphaned. + private long[] drainPendingForFlush(String baseTableName) { + long[][] drained = new long[1][]; + _pendingTimers.compute(baseTableName, (key, prev) -> { + drained[0] = _pendingRanges.remove(baseTableName); + // Returning null removes this map entry. 
Whether `prev` is the future this flush owns or a + // newer one scheduled after our scheduler thread woke up, dropping it is safe — if a newer + // event re-merges into _pendingRanges after this lambda returns, it will also call + // _pendingTimers.compute and schedule a fresh timer. + return null; + }); + return drained[0]; + } + + private void markPartitionsDirtyWithRetry(String viewTableName, long affectedStartMs, long affectedEndMs) { + // Retry only on CAS conflict (markPartitionsDirty returns false). Any thrown exception + // is a real error (corrupt znode, ZK unavailability, serialization bug, etc.) — retrying + // 128× burns ~3 s of scheduler thread time and the operator never learns about the bug. + // Fail loud after one occurrence so monitoring picks it up; the next base-table change + // for the same MV will re-trigger this code path and try again from a known-bad-state log. + for (int attempt = 0; attempt < MAX_MARK_RETRIES; attempt++) { + try { + if (markPartitionsDirty(viewTableName, affectedStartMs, affectedEndMs)) { + return; + } + LOGGER.debug("CAS conflict on attempt {} for MV table: {}, retrying", attempt + 1, viewTableName); + } catch (Exception e) { + LOGGER.error("Failed to mark dirty partitions for MV table: {} on attempt {} due to a " + + "non-CAS exception. Aborting retries — investigate the underlying ZK/serialization " + + "issue. MV may serve stale data until the next base-table change re-triggers " + + "consistency.", viewTableName, attempt + 1, e); + return; + } + // Jittered backoff: 5–25 ms × 128 attempts ≤ ~3 s total. Prevents tight CAS-loop livelock + // against the executor (which uses the same backoff window). 
+ try { + Thread.sleep(5L + ThreadLocalRandom.current().nextInt(20)); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + LOGGER.warn("Interrupted while marking dirty for MV table: {}", viewTableName); + return; + } + } + LOGGER.error("Failed to mark dirty partitions for MV table: {} after {} CAS retries. " + + "MV may serve stale data until the next base-table change re-triggers consistency.", + viewTableName, MAX_MARK_RETRIES); + } + + /// Marks overlapping VALID partitions as STALE in the MV runtime metadata. + /// + /// @return true if succeeded or nothing to mark; false if CAS failed (caller should retry) + private boolean markPartitionsDirty(String viewTableName, long affectedStartMs, long affectedEndMs) { + String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(viewTableName); + Stat stat = new Stat(); + ZNRecord znRecord = _propertyStore.get(path, stat, AccessOption.PERSISTENT); + if (znRecord == null) { + return true; + } + + MaterializedViewRuntimeMetadata runtime = MaterializedViewRuntimeMetadata.fromZNRecord(znRecord); + Map partitionInfos = runtime.getPartitions(); + + long bucketMs = inferBucketMs(viewTableName, partitionInfos); + + // Cap affectedEndMs at watermarkMs. No partition with partStart > watermarkMs can exist + // (the writer invariant), so any bucket beyond that cannot be marked STALE anyway. This + // protects the bucket-known iteration loop below from a caller passing Long.MAX_VALUE + // (full-range invalidation from notifyMaterializedViewConsistencyManager paths when + // segment startTime/endTime is unknown), which would otherwise loop ~watermarkMs/bucketMs + // times — orders of magnitude more than the number of real partitions. 
+ affectedEndMs = Math.min(affectedEndMs, runtime.getWatermarkMs()); + if (affectedEndMs < affectedStartMs) { + LOGGER.debug("Affected range [{}, {}] is past watermarkMs ({}) for MV table: {}; nothing to mark", + affectedStartMs, affectedEndMs, runtime.getWatermarkMs(), viewTableName); + return true; + } + + Map updatedInfos = new HashMap<>(partitionInfos); + boolean anyChanged = false; + int markedCount = 0; + + // bucketMs is required by the analyzer at MV-table creation, so it should always be > 0 + // here. Fail loud if not — a missing bucket config means MV state is unrecoverable + // without operator intervention; silent over-marking would only make it worse. + Preconditions.checkState(bucketMs > 0, + "MV table %s: bucketTimePeriod is required but bucketMs=%s; consistency manager cannot " + + "mark partitions without a bucket size. Repair the MV table config.", + viewTableName, bucketMs); + + // Iterate every bucket [partStart, partStart+bucketMs) that overlaps the affected range + // [affectedStartMs, affectedEndMs]. Two ranges overlap when start1 <= end2 AND end1 >= start2. + // The first overlapping bucket has partStart = floorDiv(affectedStartMs, bucketMs) * bucketMs; + // the last has partStart <= affectedEndMs (any bucket whose start is past affectedEndMs + // cannot overlap because partStart > affectedEndMs >= affectedStartMs implies the bucket + // starts after the affected range ends). floorDiv (instead of /) is used defensively so a + // future caller passing negative affectedStartMs would still produce the correct floor. + // Only flip existing VALID entries to STALE. Absent buckets are NOT synthesized — under + // Design C, a bucket's absence from the partition map already means "MV does not cover this + // range", so the broker rewrite (PR 2) routes those queries to the base table. 
Synthesizing + // STALE entries for every uncovered bucket below `watermarkMs` would explode the znode size + // on a full-range invalidation (~watermarkMs / bucketMs entries) without affecting routing + // correctness — the bucket-iteration loop below stays O(affectedRange / bucketMs) but the + // persisted map grows only with real partitions. + long partStart = Math.floorDiv(affectedStartMs, bucketMs) * bucketMs; + while (partStart <= affectedEndMs) { + PartitionInfo info = updatedInfos.get(partStart); + if (info != null && info.getState() == PartitionState.VALID) { + updatedInfos.put(partStart, info.withState(PartitionState.STALE)); + anyChanged = true; + markedCount++; + } + partStart += bucketMs; + } + + if (!anyChanged) { + LOGGER.debug("No VALID partitions to mark STALE for MV table: {} in range [{}, {}]", + viewTableName, affectedStartMs, affectedEndMs); + return true; + } + + MaterializedViewRuntimeMetadata updated = new MaterializedViewRuntimeMetadata( + runtime.getMaterializedViewTableNameWithType(), runtime.getWatermarkMs(), updatedInfos); + + // Route through `persist()` so every writer site is funnelled through the same + // `validateForPersist` gate. Translate ONLY `CasConflictException` into the retry-loop + // boolean — real validation failures (IllegalStateException / IllegalArgumentException + // from validateForPersist) and underlying ZK errors propagate so they surface at the + // task / log level rather than being silently retried 128×. 
+ try { + MaterializedViewRuntimeMetadataUtils.persist(_propertyStore, updated, stat.getVersion()); + LOGGER.info("Marked {} partition(s) STALE for MV table: {} (range [{}, {}])", + markedCount, viewTableName, affectedStartMs, affectedEndMs); + return true; + } catch (MaterializedViewRuntimeMetadataUtils.CasConflictException e) { + LOGGER.debug("CAS conflict marking partitions STALE for MV table: {} (range [{}, {}]); will retry", + viewTableName, affectedStartMs, affectedEndMs); + return false; + } + } + + /// Infers the bucket size in millis for the given MV table. First tries to read it from + /// the MV table's task config; falls back to computing the GCD of consecutive partition + /// start times from the partitionInfos map. + private long inferBucketMs(String viewTableName, Map partitionInfos) { + long fromConfig = readBucketMsFromTableConfig(viewTableName); + if (fromConfig > 0) { + return fromConfig; + } + return inferBucketMsFromPartitions(partitionInfos); + } + + private long readBucketMsFromTableConfig(String viewTableName) { + try { + String viewTableWithType = TableNameBuilder.OFFLINE.tableNameWithType(viewTableName); + TableConfig tableConfig = ZKMetadataProvider.getTableConfig(_propertyStore, viewTableWithType); + if (tableConfig == null) { + return -1; + } + TableTaskConfig taskConfig = tableConfig.getTaskConfig(); + if (taskConfig == null) { + return -1; + } + Map viewTaskConfigs = + taskConfig.getConfigsForTaskType(CommonConstants.MaterializedViewTask.TASK_TYPE); + if (viewTaskConfigs == null) { + return -1; + } + String bucketPeriod = viewTaskConfigs.get(CommonConstants.MaterializedViewTask.BUCKET_TIME_PERIOD_KEY); + if (bucketPeriod == null || bucketPeriod.isEmpty()) { + return -1; + } + return TimeUtils.convertPeriodToMillis(bucketPeriod); + } catch (Exception e) { + LOGGER.debug("Failed to read bucket config for MV table: {}", viewTableName, e); + return -1; + } + } + + /// Computes bucket size by finding the minimum gap between sorted 
partition start times. + private long inferBucketMsFromPartitions(Map partitionInfos) { + if (partitionInfos.size() < 2) { + return -1; + } + List sortedKeys = new ArrayList<>(partitionInfos.keySet()); + sortedKeys.sort(Long::compareTo); + + long minGap = Long.MAX_VALUE; + for (int i = 1; i < sortedKeys.size(); i++) { + long gap = sortedKeys.get(i) - sortedKeys.get(i - 1); + if (gap > 0) { + minGap = Math.min(minGap, gap); + } + } + return minGap == Long.MAX_VALUE ? -1 : minGap; + } + + /// Scans all MV definition ZNodes to build the baseTable → materializedViewTable reverse index. + private void rebuildReverseIndex() { + String defBasePath = ZKMetadataProvider.getPropertyStorePathForMaterializedViewDefinitionPrefix(); + Map> rebuiltIndex = new HashMap<>(); + List children = _propertyStore.getChildNames(defBasePath, AccessOption.PERSISTENT); + if (children == null || children.isEmpty()) { + synchronized (_reverseIndexLock) { + _baseTableToMaterializedViewTables.clear(); + } + LOGGER.debug("No MV definition metadata found during reverse index rebuild"); + return; + } + + for (String viewTableName : children) { + try { + String fullPath = defBasePath + "/" + viewTableName; + ZNRecord record = _propertyStore.get(fullPath, null, AccessOption.PERSISTENT); + if (record == null) { + continue; + } + MaterializedViewDefinitionMetadata definition = MaterializedViewDefinitionMetadata.fromZNRecord(record); + for (String baseTable : definition.getBaseTables()) { + rebuiltIndex.computeIfAbsent(baseTable, k -> new ArrayList<>()) + .add(viewTableName); + } + } catch (Exception e) { + LOGGER.warn("Failed to load MV definition for: {}", viewTableName, e); + } + } + synchronized (_reverseIndexLock) { + _baseTableToMaterializedViewTables.clear(); + for (Map.Entry> entry : rebuiltIndex.entrySet()) { + _baseTableToMaterializedViewTables.put(entry.getKey(), new CopyOnWriteArrayList<>(entry.getValue())); + } + } + LOGGER.info("Rebuilt MV reverse index: {}", 
_baseTableToMaterializedViewTables); + } + + private void syncDefinitionDataSubscriptions(List children) { + List definitionChildren = children != null ? children + : _propertyStore.getChildNames(MATERIALIZED_VIEW_DEFINITION_PARENT_PATH, AccessOption.PERSISTENT); + Set currentPaths = new HashSet<>(); + if (definitionChildren != null) { + for (String viewTableName : definitionChildren) { + String path = MATERIALIZED_VIEW_DEFINITION_PATH_PREFIX + viewTableName; + currentPaths.add(path); + if (_subscribedDefinitionPaths.add(path)) { + _propertyStore.subscribeDataChanges(path, _definitionChangeListener); + } + } + } + for (String path : new ArrayList<>(_subscribedDefinitionPaths)) { + if (!currentPaths.contains(path) && _subscribedDefinitionPaths.remove(path)) { + _propertyStore.unsubscribeDataChanges(path, _definitionChangeListener); + } + } + } + + private class DefinitionChangeListener implements IZkChildListener, IZkDataListener { + @Override + public void handleChildChange(String path, List children) { + syncDefinitionDataSubscriptions(children); + rebuildReverseIndex(); + } + + @Override + public void handleDataChange(String path, Object data) { + rebuildReverseIndex(); + } + + @Override + public void handleDataDeleted(String path) { + if (_subscribedDefinitionPaths.remove(path)) { + _propertyStore.unsubscribeDataChanges(path, this); + } + rebuildReverseIndex(); + } + } +} diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/context/MaterializedViewTaskGeneratorContext.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/context/MaterializedViewTaskGeneratorContext.java new file mode 100644 index 000000000000..9340df92286b --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/context/MaterializedViewTaskGeneratorContext.java @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.context; + +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import javax.annotation.Nullable; +import org.apache.helix.store.HelixPropertyStore; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.data.Schema; + + +/// Role-neutral context needed by the materialized-view analyzer and task scheduler. +/// +/// The minion task plugin provides the controller-backed implementation. The analyzer and +/// scheduler remain in this module and only depend on this narrow contract. +/// +/// Existence-probing is explicit: callers MUST invoke [#tableExists] before +/// [#getTableConfig] / [#getTableSchema] if the table is not already known to exist +/// (e.g. the analyzer's OFFLINE→REALTIME source-table fallback). The latter two methods +/// throw `IllegalStateException` on absence rather than returning null, so consumer-site +/// call paths can never accidentally swallow a missing-entity error. +public interface MaterializedViewTaskGeneratorContext { + + /// Returns `true` when a table with the given fully-qualified name exists in cluster metadata. 
+ /// Used by the analyzer's source-table resolver to probe both OFFLINE and REALTIME variants + /// at MV-create time before committing to one form. + boolean tableExists(String tableNameWithType); + + /// Returns the table config for the given fully-qualified table name (with `_OFFLINE` or + /// `_REALTIME` suffix). Throws `IllegalStateException` when the table does not exist. + /// + /// Callers that need to probe for existence MUST use [#tableExists] first; this method + /// assumes the caller has already established the table exists. Base-table delete is + /// blocked at the controller when dependent MVs exist (see + /// `MaterializedViewConsistencyManager#getDependentMaterializedViews`), so once the MV + /// passes create-time validation, downstream consumers (scheduler, executor, consistency + /// manager) can rely on the source table being present. + TableConfig getTableConfig(String tableNameWithType); + + /// Returns the schema for the given table. Schemas in Pinot are stored by raw table name; + /// for caller convenience implementations MUST accept either form (raw name or + /// name-with-type suffix) and strip the suffix internally before lookup. + /// + /// Throws `IllegalStateException` when no schema is registered for the table. The analyzer + /// invokes this on a name it has already confirmed exists; the throw signals a cluster-state + /// inconsistency (e.g. an aborted table-create that registered the config but not the + /// schema) which must fail the operation rather than be silently null-handled downstream. + Schema getTableSchema(String tableName); + + HelixPropertyStore getPropertyStore(); + + List getSegmentsZKMetadata(String tableNameWithType); + + String getVipUrl(); + + void forRunningTasks(String tableNameWithType, String taskType, Consumer> taskConfigConsumer); + + /// Returns the live value of the given Helix cluster-config key, or `null` if unset. 
+ /// + /// Implementations MUST read the value on every call (no caching) so an operator updating + /// the cluster config sees the change applied without a controller / minion restart. + /// Callers parse the returned string and apply their own default on null / malformed values. + @Nullable + String getClusterConfig(String configName); +} diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutor.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutor.java new file mode 100644 index 000000000000..f66167aedc1b --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutor.java @@ -0,0 +1,301 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.materializedview.executor; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.helix.HelixManager; +import org.apache.helix.model.InstanceConfig; +import org.apache.pinot.common.compression.CompressionFactory; +import org.apache.pinot.common.compression.Compressor; +import org.apache.pinot.common.config.GrpcConfig; +import org.apache.pinot.common.proto.Broker; +import org.apache.pinot.common.response.broker.ResultTable; +import org.apache.pinot.common.response.encoder.ResponseEncoder; +import org.apache.pinot.common.response.encoder.ResponseEncoderFactory; +import org.apache.pinot.common.utils.DataSchema; +import org.apache.pinot.common.utils.grpc.BrokerGrpcQueryClient; +import org.apache.pinot.common.utils.helix.HelixHelper; +import org.apache.pinot.spi.utils.CommonConstants; +import org.apache.pinot.spi.utils.InstanceTypeUtils; +import org.apache.pinot.spi.utils.JsonUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/// gRPC-based implementation of [MaterializedViewQueryExecutor]. +/// +/// Features: +/// +/// - **Load balancing**: discovers all gRPC-enabled brokers via Helix and +/// selects them in round-robin order. +/// - **Connection reuse**: caches [BrokerGrpcQueryClient] instances per +/// broker endpoint (`host_port`), following the same pattern used in +/// `GrpcConnection.BrokerStreamingQueryClient`. 
+/// - **Stale client cleanup**: when the broker list is refreshed, clients for +/// brokers that are no longer in the cluster are closed and evicted. +/// +/// +/// Instances are thread-safe and intended to be long-lived (one per executor factory). +public class GrpcMaterializedViewQueryExecutor implements MaterializedViewQueryExecutor { + private static final Logger LOGGER = LoggerFactory.getLogger(GrpcMaterializedViewQueryExecutor.class); + + private final HelixManager _helixManager; + private final GrpcConfig _grpcConfig; + private final ConcurrentHashMap _clientCache = new ConcurrentHashMap<>(); + private final AtomicInteger _roundRobinCounter = new AtomicInteger(0); + + public GrpcMaterializedViewQueryExecutor(HelixManager helixManager, GrpcConfig grpcConfig) { + _helixManager = helixManager; + _grpcConfig = grpcConfig; + } + + @Override + public QueryHandle executeQuery(String sql, Map authHeaders) + throws IOException { + Pair broker = selectBroker(); + LOGGER.info("Selected broker gRPC endpoint: {}:{}", broker.getLeft(), broker.getRight()); + + BrokerGrpcQueryClient client = getOrCreateClient(broker.getLeft(), broker.getRight()); + + Broker.BrokerRequest brokerRequest = Broker.BrokerRequest.newBuilder() + .setSql(sql) + .putAllMetadata(authHeaders) + .build(); + Iterator responseIterator = client.submit(brokerRequest); + + Preconditions.checkState(responseIterator.hasNext(), + "gRPC broker %s:%d returned no response for query: %s. 
Check broker health and gRPC connectivity.", + broker.getLeft(), broker.getRight(), sql); + Broker.BrokerResponse metadataResponse = responseIterator.next(); + JsonNode metadataJson = JsonUtils.bytesToJsonNode(metadataResponse.getPayload().toByteArray()); + if (metadataJson.has("exceptions") && !metadataJson.get("exceptions").isEmpty()) { + throw new IOException("Query execution failed with exceptions: " + metadataJson.get("exceptions")); + } + + // The broker gRPC protocol always sends the schema frame after metadata, even for queries + // that match zero rows. A missing schema therefore indicates a real protocol error + // (broker version mismatch, truncated stream, etc.) — fail loud with actionable diagnostics + // so the operator can identify and fix the underlying issue rather than silently advancing + // the watermark with no data. + Preconditions.checkState(responseIterator.hasNext(), + "gRPC broker %s:%d sent metadata but no schema frame for query: %s. " + + "Indicates a broker protocol error (version mismatch or truncated stream). " + + "Empty result sets still include a schema frame.", + broker.getLeft(), broker.getRight(), sql); + Broker.BrokerResponse schemaResponse = responseIterator.next(); + DataSchema dataSchema = DataSchema.fromBytes(schemaResponse.getPayload().asReadOnlyByteBuffer()); + + return new GrpcQueryHandle(broker, dataSchema, responseIterator); + } + + /// Streaming handle that decodes one gRPC data frame at a time. Heap residency is bounded by + /// the size of the currently-buffered frame, NOT the total query result size. 
+ private static final class GrpcQueryHandle implements QueryHandle { + private final Pair _broker; + private final DataSchema _dataSchema; + private final Iterator _responseIterator; + private final FrameRowIterator _rowIterator; + + private GrpcQueryHandle(Pair broker, DataSchema dataSchema, + Iterator responseIterator) { + _broker = broker; + _dataSchema = dataSchema; + _responseIterator = responseIterator; + _rowIterator = new FrameRowIterator(); + } + + @Override + public DataSchema getDataSchema() { + return _dataSchema; + } + + @Override + public Iterator rows() { + return _rowIterator; + } + + @Override + public void close() { + // Drain any remaining gRPC frames so the underlying call's server-streaming RPC is + // properly terminated. The Helix-managed channel is shared and cached; not draining + // here would leak the stream until the channel closes. + while (_responseIterator.hasNext()) { + _responseIterator.next(); + } + } + + /// Iterator that pulls one gRPC data frame at a time and decodes its rows into a small + /// buffered list; advances to the next frame only when the current one is exhausted. 
+ private final class FrameRowIterator implements Iterator { + private List _currentFrameRows; + private int _cursor; + private long _totalRowsEmitted; + private long _framesDecoded; + + @Override + public boolean hasNext() { + while (_currentFrameRows == null || _cursor >= _currentFrameRows.size()) { + if (!_responseIterator.hasNext()) { + return false; + } + decodeNextFrame(); + } + return true; + } + + @Override + public Object[] next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + _totalRowsEmitted++; + return _currentFrameRows.get(_cursor++); + } + + private void decodeNextFrame() { + Broker.BrokerResponse dataResponse = _responseIterator.next(); + Map responseMetadata = dataResponse.getMetadataMap(); + String compressionAlgorithm = responseMetadata.getOrDefault( + CommonConstants.Broker.Grpc.COMPRESSION, CommonConstants.Broker.Grpc.DEFAULT_COMPRESSION); + Compressor compressor = CompressionFactory.getCompressor(compressionAlgorithm); + String encodingType = responseMetadata.getOrDefault( + CommonConstants.Broker.Grpc.ENCODING, CommonConstants.Broker.Grpc.DEFAULT_ENCODING); + ResponseEncoder responseEncoder = ResponseEncoderFactory.getResponseEncoder(encodingType); + + byte[] respBytes = dataResponse.getPayload().toByteArray(); + String rowSizeStr = responseMetadata.get("rowSize"); + Preconditions.checkNotNull(rowSizeStr, + "gRPC response metadata missing required 'rowSize' field"); + int rowSize = Integer.parseInt(rowSizeStr); + byte[] uncompressedPayload; + try { + uncompressedPayload = compressor.decompress(respBytes); + } catch (Exception e) { + throw new RuntimeException("Failed to decompress gRPC response payload", e); + } + ResultTable resultTable; + try { + resultTable = responseEncoder.decodeResultTable(uncompressedPayload, rowSize, _dataSchema); + } catch (IOException e) { + throw new RuntimeException("Failed to decode gRPC response frame", e); + } + _currentFrameRows = resultTable.getRows(); + _cursor = 0; + _framesDecoded++; + 
if (LOGGER.isDebugEnabled()) { + LOGGER.debug("gRPC frame {}: decoded {} rows from broker {}:{}", + _framesDecoded, _currentFrameRows.size(), _broker.getLeft(), _broker.getRight()); + } + } + } + } + + /// Discovers all gRPC-enabled brokers from Helix and selects one using round-robin. + /// Also evicts cached clients for brokers that are no longer present. + @VisibleForTesting + Pair selectBroker() { + List> brokerEndpoints = discoverBrokerEndpoints(); + Preconditions.checkState(!brokerEndpoints.isEmpty(), + "No broker with gRPC enabled found in the cluster"); + + evictStaleClients(brokerEndpoints); + + int index = Math.abs(_roundRobinCounter.getAndIncrement() % brokerEndpoints.size()); + Pair selected = brokerEndpoints.get(index); + getOrCreateClient(selected.getLeft(), selected.getRight()); + return selected; + } + + /// Scans Helix instance configs for brokers that have a gRPC port configured. + private List> discoverBrokerEndpoints() { + List> endpoints = new ArrayList<>(); + List instanceConfigs = HelixHelper.getInstanceConfigs(_helixManager); + for (InstanceConfig instanceConfig : instanceConfigs) { + if (!InstanceTypeUtils.isBroker(instanceConfig.getInstanceName())) { + continue; + } + String grpcPortStr = instanceConfig.getRecord() + .getSimpleField(CommonConstants.Helix.Instance.GRPC_PORT_KEY); + if (grpcPortStr != null) { + int grpcPort = Integer.parseInt(grpcPortStr); + if (grpcPort > 0) { + endpoints.add(Pair.of(instanceConfig.getHostName(), grpcPort)); + } + } + } + return endpoints; + } + + /// Closes and removes cached clients for brokers that are no longer in the cluster. 
+ private void evictStaleClients(List> currentEndpoints) { + Set activeKeys = currentEndpoints.stream() + .map(p -> clientKey(p.getLeft(), p.getRight())) + .collect(Collectors.toSet()); + + for (String cachedKey : _clientCache.keySet()) { + if (!activeKeys.contains(cachedKey)) { + BrokerGrpcQueryClient removed = _clientCache.remove(cachedKey); + if (removed != null) { + LOGGER.info("Evicting stale gRPC client for broker: {}", cachedKey); + removed.close(); + } + } + } + } + + private BrokerGrpcQueryClient getOrCreateClient(String host, int port) { + String key = clientKey(host, port); + return _clientCache.computeIfAbsent(key, + k -> new BrokerGrpcQueryClient(host, port, _grpcConfig)); + } + + private static String clientKey(String host, int port) { + return host + "_" + port; + } + + @VisibleForTesting + int getCachedClientCount() { + return _clientCache.size(); + } + + @Override + public void close() { + for (BrokerGrpcQueryClient client : _clientCache.values()) { + try { + client.close(); + } catch (Exception e) { + LOGGER.warn("Error closing gRPC client", e); + } + } + _clientCache.clear(); + } +} diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/MaterializedViewQueryExecutor.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/MaterializedViewQueryExecutor.java new file mode 100644 index 000000000000..45c8a14b98a2 --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/executor/MaterializedViewQueryExecutor.java @@ -0,0 +1,91 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.executor; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; +import org.apache.pinot.common.utils.DataSchema; + + +/// Abstraction for executing SQL queries against Pinot and streaming typed result rows. +/// +/// Implementations may use different transport protocols (e.g. gRPC, Arrow Flight) +/// and are responsible for broker discovery, connection management, load balancing, +/// and response deserialization. +/// +/// **Streaming contract:** [#executeQuery] returns a [QueryHandle] after the schema frame has +/// been consumed but BEFORE the data rows are buffered. The caller pulls rows from +/// [QueryHandle#rows] on demand; the implementation backs the iterator with the underlying +/// transport stream and decodes one chunk at a time. Heap residency is therefore bounded by +/// the caller's chunk size, not by the total row count of the query — large MV windows that +/// would have OOM'd a buffer-the-whole-result API stream through fine. +/// +/// **Lifecycle:** the caller MUST close the [QueryHandle] (try-with-resources). Close cancels +/// the stream if the iterator hasn't been fully drained, releasing the underlying transport +/// resources and any buffered frame. +/// +/// Instances are expected to be long-lived and thread-safe so they can be shared across +/// multiple task executions. A single [QueryHandle], however, is single-threaded. 
+public interface MaterializedViewQueryExecutor extends Closeable { + + /// Issues the query and returns a streaming handle. Blocks until the metadata + schema frames + /// have been received from the broker; data frames are pulled lazily as the caller iterates. + /// + /// @param sql the SQL query to execute + /// @param authHeaders authentication headers to include in the request + /// @return a handle exposing the schema and a row iterator; caller MUST close + /// @throws IOException if communication with the broker fails or the schema frame is missing + QueryHandle executeQuery(String sql, Map authHeaders) + throws IOException; + + /// Streaming handle for a single executing query: exposes the schema plus a row iterator + /// backed by the underlying transport. + /// + /// Contract: + /// + /// - [#getDataSchema] is safe to call any time; the schema is available as soon as + /// [#executeQuery] returns and never changes for the lifetime of the handle. + /// - [#rows] returns the same iterator instance on every call; the underlying stream is + /// single-pass, so calling [#rows] more than once produces the same iterator and is + /// therefore equivalent to caching the first return value. + /// - The iterator is single-pass and single-threaded. + /// [Iterator#next] throws [java.util.NoSuchElementException] when the stream is + /// exhausted (caller must drive iteration with [Iterator#hasNext]). + /// - [#close] drains any remaining stream frames so the underlying transport (e.g. gRPC + /// server-streaming RPC) is properly terminated. Implementations MAY also issue an + /// explicit cancel to free server resources sooner; the contract guarantees only that + /// the underlying call ends. It is idempotent: a second invocation no-ops. Using + /// [#rows] after close is undefined behavior. + /// - The handle is NOT safe to share across threads. A single task executor owns the + /// handle for the duration of one query. 
+ /// - Overrides [Closeable#close] to drop the `IOException` declaration: the gRPC stream + /// drain on close cannot fail in a way the caller can recover from, so any underlying + /// transport error is rethrown as an unchecked exception rather than forcing + /// try-with-resources callers to catch a checked exception. + interface QueryHandle extends Closeable { + DataSchema getDataSchema(); + + Iterator rows(); + + @Override + void close(); + } +} diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadata.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadata.java new file mode 100644 index 000000000000..0057f1219a3a --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadata.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.pinot.materializedview.metadata;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.core.type.TypeReference;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import javax.annotation.Nullable;
+import org.apache.helix.zookeeper.datamodel.ZNRecord;
+import org.apache.pinot.spi.utils.JsonUtils;
+
+
+/// Stores the static definition of a materialized view: what it is derived from,
+/// the SQL that produces it, how time columns map, and what split parameters are needed.
+///
+/// Persisted in ZooKeeper under `/CONFIGS/MATERIALIZED_VIEW/DEFINITION/<materializedViewTableNameWithType>`.
+/// This ZNode changes only when the materialized view is created or its definition is altered — never
+/// during routine task execution or partition-state changes.
+///
+/// Thread-safety: all fields are final, but the list/map passed in are stored and returned uncopied; do not mutate.
+public class MaterializedViewDefinitionMetadata {
+
+  private static final String BASE_TABLES_KEY = "baseTables";
+  private static final String DEFINED_SQL_KEY = "definedSql";
+  private static final String PARTITION_EXPR_MAPS_KEY = "partitionExprMaps";
+  private static final String SPLIT_SOURCE_TIME_COLUMN_KEY = "splitSourceTimeColumn";
+  private static final String SPLIT_SOURCE_TIME_FORMAT_KEY = "splitSourceTimeFormat";
+  private static final String SPLIT_MATERIALIZED_VIEW_TIME_COLUMN_KEY = "splitMaterializedViewTimeColumn";
+  private static final String SPLIT_BUCKET_MS_KEY = "splitBucketMs";
+  private static final String STALENESS_THRESHOLD_MS_KEY = "stalenessThresholdMs";
+  private static final String REWRITE_ENABLED_KEY = "rewriteEnabled";
+
+  private static final TypeReference<List<String>> STRING_LIST_TYPE =
+      new TypeReference<List<String>>() { };
+  private static final TypeReference<Map<String, String>> STRING_MAP_TYPE =
+      new TypeReference<Map<String, String>>() { };
+
+  private final String _materializedViewTableNameWithType;
+  private final List<String> _baseTables;
+  private final String _definedSql;
+
+  /// Maps base-table expression strings to MV column identifiers, recording how each base
+  /// table time column expression is transformed into the corresponding MV time column.
+  /// For example: `{"dateTimeConvert(ts,'1:MILLISECONDS:EPOCH','1:DAYS:EPOCH','1:DAYS')": "materializedViewDay"}`
+  /// or for a simple pass-through: `{"ts": "ts"}`.
+  private final Map<String, String> _partitionExprMaps;
+
+  @Nullable
+  private final MaterializedViewSplitSpec _splitSpec;
+
+  /// Per-MV staleness SLO (millis). `0` means "no SLO check". Broker excludes the MV when
+  /// `(now - watermarkMs) > stalenessThresholdMs`.
+  private final long _stalenessThresholdMs;
+
+  /// Per-MV rewrite kill switch. `true` (default) means broker may rewrite user queries to
+  /// this MV when subsumption holds. Operators can set `false` to keep ingestion running
+  /// while temporarily routing all queries to the base table (e.g. during MV migration /
+  /// schema bring-up).
+  private final boolean _rewriteEnabled;
+
+  public MaterializedViewDefinitionMetadata(String viewTableNameWithType, List<String> baseTables,
+      String definedSql, Map<String, String> partitionExprMaps,
+      @Nullable MaterializedViewSplitSpec splitSpec) {
+    this(viewTableNameWithType, baseTables, definedSql, partitionExprMaps, splitSpec, 0L, true);
+  }
+
+  public MaterializedViewDefinitionMetadata(String viewTableNameWithType, List<String> baseTables,
+      String definedSql, Map<String, String> partitionExprMaps,
+      @Nullable MaterializedViewSplitSpec splitSpec, long stalenessThresholdMs, boolean rewriteEnabled) {
+    _materializedViewTableNameWithType = viewTableNameWithType;
+    _baseTables = baseTables;
+    _definedSql = definedSql;
+    _partitionExprMaps = partitionExprMaps;
+    _splitSpec = splitSpec;
+    _stalenessThresholdMs = stalenessThresholdMs;
+    _rewriteEnabled = rewriteEnabled;
+  }
+
+  public String getMaterializedViewTableNameWithType() {
+    return _materializedViewTableNameWithType;
+  }
+
+  public List<String> getBaseTables() {
+    return _baseTables;
+  }
+
+  public String getDefinedSql() {
+    return _definedSql;
+  }
+
+  public Map<String, String> getPartitionExprMaps() {
+    return _partitionExprMaps;
+  }
+
+  @Nullable
+  public MaterializedViewSplitSpec getSplitSpec() {
+    return _splitSpec;
+  }
+
+  public long getStalenessThresholdMs() {
+    return _stalenessThresholdMs;
+  }
+
+  public boolean isRewriteEnabled() {
+    return _rewriteEnabled;
+  }
+
+  public ZNRecord toZNRecord() {
+    ZNRecord znRecord = new ZNRecord(_materializedViewTableNameWithType);
+    try {
+      znRecord.setSimpleField(BASE_TABLES_KEY, JsonUtils.objectToString(_baseTables));
+      if (_definedSql != null) {
+        znRecord.setSimpleField(DEFINED_SQL_KEY, _definedSql);
+      }
+      if (_partitionExprMaps != null && !_partitionExprMaps.isEmpty()) {
+        znRecord.setSimpleField(PARTITION_EXPR_MAPS_KEY, JsonUtils.objectToString(_partitionExprMaps));
+      }
+    } catch (JsonProcessingException e) {
+      throw new IllegalStateException("Failed to serialize MaterializedViewDefinitionMetadata", e);
+    }
+
+    if (_splitSpec != null) {
+      znRecord.setSimpleField(SPLIT_SOURCE_TIME_COLUMN_KEY, _splitSpec.getSourceTimeColumn());
+      znRecord.setSimpleField(SPLIT_SOURCE_TIME_FORMAT_KEY, _splitSpec.getSourceTimeFormat());
+      znRecord.setSimpleField(SPLIT_MATERIALIZED_VIEW_TIME_COLUMN_KEY, _splitSpec.getMaterializedViewTimeColumn());
+      znRecord.setLongField(SPLIT_BUCKET_MS_KEY, _splitSpec.getBucketMs());
+    }
+
+    if (_stalenessThresholdMs > 0) {
+      znRecord.setLongField(STALENESS_THRESHOLD_MS_KEY, _stalenessThresholdMs);
+    }
+    // Always persist rewriteEnabled so toggling false sticks; reader defaults to true on absence
+    // for backward compat with pre-V2 definitions (rewrite enabled is the safe default).
+    znRecord.setBooleanField(REWRITE_ENABLED_KEY, _rewriteEnabled);
+
+    return znRecord;
+  }
+
+  public static MaterializedViewDefinitionMetadata fromZNRecord(ZNRecord znRecord) {
+    String viewTableNameWithType = znRecord.getId();
+    try {
+      String baseTablesJson = znRecord.getSimpleField(BASE_TABLES_KEY);
+      List<String> baseTables = baseTablesJson != null
+          ? JsonUtils.stringToObject(baseTablesJson, STRING_LIST_TYPE)
+          : Collections.emptyList();
+
+      String definedSql = znRecord.getSimpleField(DEFINED_SQL_KEY);
+
+      String partitionExprMapsJson = znRecord.getSimpleField(PARTITION_EXPR_MAPS_KEY);
+      Map<String, String> partitionExprMaps = partitionExprMapsJson != null
+          ? JsonUtils.stringToObject(partitionExprMapsJson, STRING_MAP_TYPE)
+          : new HashMap<>();
+
+      MaterializedViewSplitSpec splitSpec = null;
+      String sourceTimeColumn = znRecord.getSimpleField(SPLIT_SOURCE_TIME_COLUMN_KEY);
+      if (sourceTimeColumn != null) {
+        String sourceTimeFormat = znRecord.getSimpleField(SPLIT_SOURCE_TIME_FORMAT_KEY);
+        String viewTimeColumn = znRecord.getSimpleField(SPLIT_MATERIALIZED_VIEW_TIME_COLUMN_KEY);
+        long bucketMs = znRecord.getLongField(SPLIT_BUCKET_MS_KEY, 0L);
+        splitSpec =
+            new MaterializedViewSplitSpec(sourceTimeColumn, sourceTimeFormat, viewTimeColumn, bucketMs);
+      }
+
+      long stalenessThresholdMs = znRecord.getLongField(STALENESS_THRESHOLD_MS_KEY, 0L);
+      boolean rewriteEnabled = znRecord.getBooleanField(REWRITE_ENABLED_KEY, true);
+
+      return new MaterializedViewDefinitionMetadata(viewTableNameWithType, baseTables, definedSql,
+          partitionExprMaps, splitSpec, stalenessThresholdMs, rewriteEnabled);
+    } catch (IOException e) {
+      throw new IllegalStateException("Failed to deserialize MaterializedViewDefinitionMetadata from ZNRecord", e);
+    }
+  }
+
+  /// Specifies the time columns used to express the split boundary `watermarkMs`:
+  ///
+  /// - Source (base) side: filter `sourceTimeColumn >= watermarkMs`. The base column may use
+  ///   any [DateTimeFieldSpec] format — the broker converts `watermarkMs` to the source's
+  ///   native unit using `sourceTimeFormat` before attaching the filter.
+  /// - MV side: filter `materializedViewTimeColumn < watermarkMs`. The MV column is constrained to
+  ///   [DataType#TIMESTAMP] (epoch millis) by [MaterializedViewAnalyzer], so the literal is
+  ///   always the raw `watermarkMs` value.
+  ///
+  /// Thread-safety: instances are immutable.
+  public static class MaterializedViewSplitSpec {
+    private final String _sourceTimeColumn;
+    private final String _sourceTimeFormat;
+    private final String _materializedViewTimeColumn;
+    private final long _bucketMs;
+
+    public MaterializedViewSplitSpec(String sourceTimeColumn, String sourceTimeFormat,
+        String viewTimeColumn, long bucketMs) {
+      _sourceTimeColumn = sourceTimeColumn;
+      _sourceTimeFormat = sourceTimeFormat;
+      _materializedViewTimeColumn = viewTimeColumn;
+      _bucketMs = bucketMs;
+    }
+
+    public String getSourceTimeColumn() {
+      return _sourceTimeColumn;
+    }
+
+    public String getSourceTimeFormat() {
+      return _sourceTimeFormat;
+    }
+
+    public String getMaterializedViewTimeColumn() {
+      return _materializedViewTimeColumn;
+    }
+
+    public long getBucketMs() {
+      return _bucketMs;
+    }
+  }
+}
diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadataUtils.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadataUtils.java
new file mode 100644
index 000000000000..15a11ec18af1
--- /dev/null
+++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewDefinitionMetadataUtils.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.materializedview.metadata;
+
+import javax.annotation.Nullable;
+import org.apache.helix.AccessOption;
+import org.apache.helix.store.HelixPropertyStore;
+import org.apache.helix.zookeeper.datamodel.ZNRecord;
+import org.apache.helix.zookeeper.zkclient.exception.ZkException;
+import org.apache.pinot.common.metadata.ZKMetadataProvider;
+import org.apache.zookeeper.data.Stat;
+
+
+/// Utility methods to fetch/persist [MaterializedViewDefinitionMetadata] from/to ZooKeeper
+/// under the path `/CONFIGS/MATERIALIZED_VIEW/DEFINITION/<viewTableNameWithType>`.
+public final class MaterializedViewDefinitionMetadataUtils {
+
+  private MaterializedViewDefinitionMetadataUtils() {
+  }
+
+  @Nullable
+  public static MaterializedViewDefinitionMetadata fetch(HelixPropertyStore<ZNRecord> propertyStore,
+      String viewTableNameWithType) {
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewDefinition(viewTableNameWithType);
+    Stat stat = new Stat();
+    ZNRecord znRecord = propertyStore.get(path, stat, AccessOption.PERSISTENT);
+    if (znRecord == null) {
+      return null;
+    }
+    znRecord.setVersion(stat.getVersion());
+    return MaterializedViewDefinitionMetadata.fromZNRecord(znRecord);
+  }
+
+  public static void persist(HelixPropertyStore<ZNRecord> propertyStore,
+      MaterializedViewDefinitionMetadata metadata, int expectedVersion) {
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewDefinition(
+        metadata.getMaterializedViewTableNameWithType());
+    if (!propertyStore.set(path, metadata.toZNRecord(), expectedVersion, AccessOption.PERSISTENT)) {
+      throw new ZkException("Failed to persist MaterializedViewDefinitionMetadata for: "
+          + metadata.getMaterializedViewTableNameWithType());
+    }
+  }
+
+  /// Creates the definition metadata znode only if it does not already exist. Returns true on
+  /// success; returns false if a concurrent writer already created the znode. Used on
+  /// cold-start so two scheduler runs do not clobber each other's `partitionExprMaps` /
+  /// `splitSpec` (which can diverge under a racing schema update).
+  public static boolean createIfAbsent(HelixPropertyStore<ZNRecord> propertyStore,
+      MaterializedViewDefinitionMetadata metadata) {
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewDefinition(
+        metadata.getMaterializedViewTableNameWithType());
+    return propertyStore.create(path, metadata.toZNRecord(), AccessOption.PERSISTENT);
+  }
+
+  public static void delete(HelixPropertyStore<ZNRecord> propertyStore,
+      String viewTableNameWithType) {
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewDefinition(viewTableNameWithType);
+    if (!propertyStore.remove(path, AccessOption.PERSISTENT)) {
+      throw new ZkException("Failed to delete MaterializedViewDefinitionMetadata for: " + viewTableNameWithType);
+    }
+  }
+}
diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadata.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadata.java
new file mode 100644
index 000000000000..c21a7bc09e5d
--- /dev/null
+++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadata.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.materializedview.metadata;
+
+import com.google.common.base.Preconditions;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.helix.zookeeper.datamodel.ZNRecord;
+
+
+/// Stores the mutable runtime state of a materialized view: how far it has been materialized
+/// and per-partition consistency info.
+///
+/// Persisted in ZooKeeper under `/CONFIGS/MATERIALIZED_VIEW/RUNTIME/<materializedViewTableNameWithType>`.
+/// This ZNode is updated by the Minion Executor (APPEND/OVERWRITE/DELETE) and the Controller
+/// ConsistencyManager (VALID → STALE marking).
+///
+/// ### Coverage model (Design C)
+///
+/// The partition map IS the authoritative coverage: a bucket entry's presence + state
+/// determines whether the broker can serve queries against that time range from the MV.
+///
+/// - `watermarkMs` — scheduling hint: highest contiguous VALID block from epoch up. Used
+///   by the scheduler to drive APPEND task selection and by the broker as the split point
+///   for the 2-way SPLIT_REWRITE. Not used as a coverage boundary directly — the
+///   partition map controls per-bucket routing.
+/// - `partitions` — `bucketStart → PartitionInfo(VALID|STALE, fingerprint, lastRefreshTime)`.
+///   A bucket's absence from this map means "MV does not cover this time range";
+///   deletion is modeled by removing the entry (no separate EXPIRED state).
+///
+/// Freshness is derived on read (e.g. `now - watermarkMs > stalenessThresholdMs` ⇒ STALE);
+/// there is no persistent freshness field.
+///
+/// ### Partition model (TIME-WINDOWED ONLY in PR 1)
+///
+/// `_partitions` is keyed by `Long bucketStartMs`. The wire format already stores keys as
+/// strings (`Long.toString(bucketStart)` in [#toZNRecord], parsed in [#fromZNRecord]), so the
+/// on-disk schema is partition-shape neutral and can carry future categorical keys without
+/// a breaking change. The in-memory key type, however, is `Long` today and will need to
+/// generalize (to `String`, or to a `PartitionKey` sum type) when fixed-partition MVs land.
+/// See `pinot-materialized-view/DESIGN.md` for the migration plan.
+///
+/// Thread-safety: instances are effectively immutable after construction.
+public class MaterializedViewRuntimeMetadata {
+  private static final String WATERMARK_MS_KEY = "watermarkMs";
+  private static final String PARTITION_INFOS_MAP_KEY = "partitionInfos";
+
+  private final String _materializedViewTableNameWithType;
+  private final long _watermarkMs;
+  private final Map<Long, PartitionInfo> _partitions;
+
+  public MaterializedViewRuntimeMetadata(String viewTableNameWithType, long watermarkMs,
+      Map<Long, PartitionInfo> partitions) {
+    Preconditions.checkArgument(watermarkMs >= 0,
+        "watermarkMs must be non-negative, got: %s", watermarkMs);
+    _materializedViewTableNameWithType = viewTableNameWithType;
+    _watermarkMs = watermarkMs;
+    // Defensive copy: the class advertises immutability and PartitionInfo entries are themselves
+    // immutable, so copying just the map structure is enough to stop callers from mutating
+    // their handle after construction and silently corrupting the cached / persisted view.
+    _partitions = partitions == null ? Map.of() : Map.copyOf(partitions);
+  }
+
+  /// Validates the writer-side invariants before persistence. Writers MUST invoke this.
+  public void validateForPersist() {
+    // No cross-field invariants under Design C: partitions map IS the coverage, watermarkMs is
+    // a derived scheduling hint. Kept for forward-compatibility — future invariants can be
+    // added here.
+  }
+
+  public String getMaterializedViewTableNameWithType() {
+    return _materializedViewTableNameWithType;
+  }
+
+  public long getWatermarkMs() {
+    return _watermarkMs;
+  }
+
+  public Map<Long, PartitionInfo> getPartitions() {
+    return _partitions;
+  }
+
+  public ZNRecord toZNRecord() {
+    ZNRecord znRecord = new ZNRecord(_materializedViewTableNameWithType);
+    znRecord.setLongField(WATERMARK_MS_KEY, _watermarkMs);
+
+    // Each partition becomes its own map-field entry keyed by `bucketStartMs`. This uses
+    // ZNRecord's native two-level structure (top-level mapFields → Map<String, String> per
+    // partition) so the on-the-wire shape is structured rather than packed strings.
+    for (Map.Entry<Long, PartitionInfo> entry : _partitions.entrySet()) {
+      znRecord.setMapField(Long.toString(entry.getKey()), entry.getValue().toFieldMap());
+    }
+
+    return znRecord;
+  }
+
+  public static MaterializedViewRuntimeMetadata fromZNRecord(ZNRecord znRecord) {
+    String viewTableNameWithType = znRecord.getId();
+    long watermarkMs = znRecord.getLongField(WATERMARK_MS_KEY, 0L);
+
+    Map<Long, PartitionInfo> partitions = new HashMap<>();
+    Map<String, Map<String, String>> mapFields = znRecord.getMapFields();
+    if (mapFields != null) {
+      for (Map.Entry<String, Map<String, String>> entry : mapFields.entrySet()) {
+        // Skip the legacy combined-partitions key from V1 if encountered (forward-compat read).
+        if (PARTITION_INFOS_MAP_KEY.equals(entry.getKey())) {
+          continue;
+        }
+        try {
+          long partitionStartMs = Long.parseLong(entry.getKey());
+          partitions.put(partitionStartMs, PartitionInfo.fromFieldMap(entry.getValue()));
+        } catch (NumberFormatException e) {
+          // Non-numeric keys are not partitions; skip. NOTE(review): also hides NFEs from fromFieldMap - confirm.
+        }
+      }
+    }
+
+    return new MaterializedViewRuntimeMetadata(viewTableNameWithType, watermarkMs, partitions);
+  }
+}
diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadataUtils.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadataUtils.java
new file mode 100644
index 000000000000..2433037a7f15
--- /dev/null
+++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/MaterializedViewRuntimeMetadataUtils.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.materializedview.metadata;
+
+import javax.annotation.Nullable;
+import org.apache.helix.AccessOption;
+import org.apache.helix.store.HelixPropertyStore;
+import org.apache.helix.zookeeper.datamodel.ZNRecord;
+import org.apache.helix.zookeeper.zkclient.exception.ZkException;
+import org.apache.pinot.common.metadata.ZKMetadataProvider;
+import org.apache.zookeeper.data.Stat;
+
+
+/// Utility methods to fetch/persist [MaterializedViewRuntimeMetadata] from/to ZooKeeper
+/// under the path `/CONFIGS/MATERIALIZED_VIEW/RUNTIME/`.
+public final class MaterializedViewRuntimeMetadataUtils {
+
+  private MaterializedViewRuntimeMetadataUtils() {
+  }
+
+  @Nullable
+  public static MaterializedViewRuntimeMetadata fetch(HelixPropertyStore<ZNRecord> propertyStore,
+      String viewTableNameWithType) {
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(viewTableNameWithType);
+    Stat stat = new Stat();
+    ZNRecord znRecord = propertyStore.get(path, stat, AccessOption.PERSISTENT);
+    if (znRecord == null) {
+      return null;
+    }
+    znRecord.setVersion(stat.getVersion());
+    return MaterializedViewRuntimeMetadata.fromZNRecord(znRecord);
+  }
+
+  /// Fetches runtime metadata and reports the znode's ZK stat through `outStat`.
+  /// Pass `outStat.getVersion()` as `expectedVersion` to [#persist] for a CAS write.
+  ///
+  /// @return the runtime metadata, or `null` if not found
+  @Nullable
+  public static MaterializedViewRuntimeMetadata fetchWithVersion(HelixPropertyStore<ZNRecord> propertyStore,
+      String viewTableNameWithType, Stat outStat) {
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(viewTableNameWithType);
+    ZNRecord znRecord = propertyStore.get(path, outStat, AccessOption.PERSISTENT);
+    if (znRecord == null) {
+      return null;
+    }
+    return MaterializedViewRuntimeMetadata.fromZNRecord(znRecord);
+  }
+
+  /// Persists the runtime metadata via version-checked CAS write.
+  ///
+  /// `expectedVersion` MUST match the ZK version of the value the caller fetched (use
+  /// `fetchWithVersion`). A version mismatch is reported via a typed exception so callers
+  /// can distinguish "another writer beat us — retry" from "the metadata itself is invalid":
+  ///
+  /// - [CasConflictException] — the version-checked `set` returned `false`, which this method
+  ///   reports as a lost CAS race. Callers SHOULD re-fetch and retry.
+  /// - [IllegalStateException] / [IllegalArgumentException] — `validateForPersist` rejected
+  ///   the instance. Callers MUST NOT retry; the data is structurally invalid.
+  /// - [ZkException] — underlying ZK transport / session failure. Callers SHOULD retry
+  ///   with backoff; catch [CasConflictException] first, since it is a [ZkException] subtype.
+  public static void persist(HelixPropertyStore<ZNRecord> propertyStore,
+      MaterializedViewRuntimeMetadata metadata, int expectedVersion) {
+    // Strict writer-side check: refuse to persist an instance that violates any
+    // forward-compatibility invariants enforced by validateForPersist(). Reads
+    // (constructor) tolerate slightly inconsistent historical data; writes never
+    // propagate inconsistency forward.
+    metadata.validateForPersist();
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(
+        metadata.getMaterializedViewTableNameWithType());
+    if (!propertyStore.set(path, metadata.toZNRecord(), expectedVersion, AccessOption.PERSISTENT)) {
+      throw new CasConflictException("CAS conflict persisting MaterializedViewRuntimeMetadata for: "
+          + metadata.getMaterializedViewTableNameWithType()
+          + " (expectedVersion=" + expectedVersion + " did not match)");
+    }
+  }
+
+  /// Thrown by [#persist] when the version-checked CAS write is rejected because another
+  /// writer mutated the znode first. Carries the message so the retry loop can surface
+  /// the conflict at ERROR level when retries exhaust.
+  public static final class CasConflictException extends ZkException {
+    public CasConflictException(String message) {
+      super(message);
+    }
+  }
+
+  /// Creates the runtime metadata znode only if it does not already exist. Returns true on
+  /// success; returns false if a concurrent writer already created the znode (the caller can
+  /// then re-fetch and proceed with that value). Used on cold-start to avoid two scheduler
+  /// runs blind-clobbering each other.
+  public static boolean createIfAbsent(HelixPropertyStore<ZNRecord> propertyStore,
+      MaterializedViewRuntimeMetadata metadata) {
+    metadata.validateForPersist();
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(
+        metadata.getMaterializedViewTableNameWithType());
+    return propertyStore.create(path, metadata.toZNRecord(), AccessOption.PERSISTENT);
+  }
+
+  public static void delete(HelixPropertyStore<ZNRecord> propertyStore,
+      String viewTableNameWithType) {
+    String path = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(viewTableNameWithType);
+    if (!propertyStore.remove(path, AccessOption.PERSISTENT)) {
+      throw new ZkException("Failed to delete MaterializedViewRuntimeMetadata for: " + viewTableNameWithType);
+    }
+  }
+}
diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionFingerprint.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionFingerprint.java
new file mode 100644
index 000000000000..a80c935c5dc9
--- /dev/null
+++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionFingerprint.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.materializedview.metadata;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.TreeMap;
+
+
+/// Fingerprint of a materialized partition, capturing how many base segments contributed
+/// and their aggregate CRC. Used to detect when base table data has changed since the
+/// partition was last materialized.
+///
+/// Single-entry serialized form: `"segmentCount,crcChecksum"` for ZNRecord map fields.
+///
+/// Map serialized form (for task config transport):
+/// `"partStartMs1=segCnt1,crc1;partStartMs2=segCnt2,crc2"`.
+///
+/// Thread-safety: instances are immutable after construction.
+public class PartitionFingerprint {
+  private static final char SEPARATOR = ',';
+
+  /// Number of base table segments whose time range overlaps this partition window.
+  private final int _segmentCount;
+
+  /// Sum of CRC values from all overlapping base segments, recorded at materialization time.
+  ///
+  /// This field is NOT used during the event-driven dirty marking phase
+  /// (`MaterializedViewConsistencyManager`), which marks partitions as STALE based
+  /// solely on time-range overlap with changed segments.
+  ///
+  /// It IS used by the Generator's precise verification step
+  /// (`tryGenerateOverwriteTask`): when a partition is already marked STALE, the
+  /// Generator re-computes the current fingerprint and compares it against this stored
+  /// baseline via [#equals]. Without the CRC, a scenario where one segment is
+  /// deleted and a different segment is uploaded would leave `segmentCount` unchanged,
+  /// causing the Generator to incorrectly revert the partition to VALID. The CRC sum
+  /// will differ in that case, correctly confirming the data change.
+  ///
+  /// The Executor writes this value after successful materialization, establishing the
+  /// baseline snapshot for future Generator comparisons.
+  private final long _crcChecksum;
+
+  public PartitionFingerprint(int segmentCount, long crcChecksum) {
+    _segmentCount = segmentCount;
+    _crcChecksum = crcChecksum;
+  }
+
+  public int getSegmentCount() {
+    return _segmentCount;
+  }
+
+  public long getCrcChecksum() {
+    return _crcChecksum;
+  }
+
+  /// Encodes this fingerprint as `"segmentCount,crcChecksum"`.
+  public String encode() {
+    return _segmentCount + String.valueOf(SEPARATOR) + _crcChecksum;
+  }
+
+  /// Decodes a fingerprint from the format `"segmentCount,crcChecksum"`.
+  ///
+  /// @throws IllegalArgumentException if the separator is missing or either part is not numeric
+  public static PartitionFingerprint decode(String encoded) {
+    int separatorIdx = encoded.indexOf(SEPARATOR);
+    if (separatorIdx < 0) {
+      throw new IllegalArgumentException("Invalid PartitionFingerprint encoding: " + encoded);
+    }
+    int segmentCount = Integer.parseInt(encoded.substring(0, separatorIdx));
+    long crcChecksum = Long.parseLong(encoded.substring(separatorIdx + 1));
+    return new PartitionFingerprint(segmentCount, crcChecksum);
+  }
+
+  /// Encodes a map of partition fingerprints as
+  /// `"partStartMs1=segCnt1,crc1;partStartMs2=segCnt2,crc2"`.
+  /// Entries are sorted by `partStartMs` key so the output is deterministic across JVMs and
+  /// stable across decode/encode round-trips. Callers that key caches or logs on this string
+  /// rely on byte-identical output for the same logical map.
+  public static String encodeMap(Map<Long, PartitionFingerprint> map) {
+    if (map == null || map.isEmpty()) {
+      return "";
+    }
+    StringBuilder sb = new StringBuilder();
+    boolean first = true;
+    for (Map.Entry<Long, PartitionFingerprint> entry : new TreeMap<>(map).entrySet()) {
+      if (!first) {
+        sb.append(';');
+      }
+      sb.append(entry.getKey()).append('=').append(entry.getValue().encode());
+      first = false;
+    }
+    return sb.toString();
+  }
+
+  /// Decodes a map of partition fingerprints from the format produced by [#encodeMap].
+  ///
+  /// @return empty map if the input is null or blank
+  /// @throws IllegalArgumentException if any entry is malformed
+  public static Map<Long, PartitionFingerprint> decodeMap(String encoded) {
+    Map<Long, PartitionFingerprint> map = new HashMap<>();
+    if (encoded == null || encoded.trim().isEmpty()) {
+      return map;
+    }
+    for (String entry : encoded.split(";")) {
+      int eqIdx = entry.indexOf('=');
+      if (eqIdx < 0) {
+        throw new IllegalArgumentException("Invalid partition fingerprint map entry: " + entry);
+      }
+      long partitionStartMs = Long.parseLong(entry.substring(0, eqIdx));
+      PartitionFingerprint fp = decode(entry.substring(eqIdx + 1));
+      map.put(partitionStartMs, fp);
+    }
+    return map;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    PartitionFingerprint that = (PartitionFingerprint) o;
+    return _segmentCount == that._segmentCount && _crcChecksum == that._crcChecksum;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(_segmentCount, _crcChecksum);
+  }
+
+  @Override
+  public String toString() {
+    return "PartitionFingerprint{segmentCount=" + _segmentCount + ", crcChecksum=" + _crcChecksum + "}";
+  }
+}
diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionInfo.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionInfo.java
new file mode 100644
index 000000000000..0dfdafe54a10
--- /dev/null
+++ 
b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionInfo.java @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.metadata; + +import com.google.common.base.Preconditions; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + + +/// Tracks the state and provenance of a single materialized partition. +/// +/// Fields: +/// +/// - `state` – whether the partition is up-to-date ([PartitionState#VALID]) +/// or needs re-materialization ([PartitionState#STALE]). +/// - `fingerprint` – the base segment snapshot (count + CRC) recorded when +/// the partition was last materialized. +/// - `lastRefreshTime` – wall clock time (millis) of the last successful materialization. +/// +/// Serialized as a typed `Map` ZNRecord map-field entry with keys +/// `state`, `segmentCount`, `crc`, `lastRefreshTime`. This shape is forward-compatible +/// (new fields can be added without breaking older readers, which ignore unknown keys) +/// unlike the prior packed `"V,10,5000,1700006400000"` string format. +/// +/// Thread-safety: instances are immutable after construction. 
+public class PartitionInfo { + private static final String STATE_KEY = "state"; + private static final String SEGMENT_COUNT_KEY = "segmentCount"; + private static final String CRC_KEY = "crc"; + private static final String LAST_REFRESH_TIME_KEY = "lastRefreshTime"; + + private final PartitionState _state; + private final PartitionFingerprint _fingerprint; + private final long _lastRefreshTime; + + public PartitionInfo(PartitionState state, PartitionFingerprint fingerprint, long lastRefreshTime) { + _state = state; + _fingerprint = fingerprint; + _lastRefreshTime = lastRefreshTime; + } + + public PartitionState getState() { + return _state; + } + + public PartitionFingerprint getFingerprint() { + return _fingerprint; + } + + public long getLastRefreshTime() { + return _lastRefreshTime; + } + + /// Creates a new `PartitionInfo` with the given state, keeping fingerprint and + /// lastRefreshTime unchanged. + public PartitionInfo withState(PartitionState newState) { + return new PartitionInfo(newState, _fingerprint, _lastRefreshTime); + } + + /// Serializes to a typed map suitable for `ZNRecord.setMapField(bucketStart, ...)`. + public Map toFieldMap() { + Map map = new HashMap<>(4); + map.put(STATE_KEY, _state.encode()); + map.put(SEGMENT_COUNT_KEY, Integer.toString(_fingerprint.getSegmentCount())); + map.put(CRC_KEY, Long.toString(_fingerprint.getCrcChecksum())); + map.put(LAST_REFRESH_TIME_KEY, Long.toString(_lastRefreshTime)); + return map; + } + + /// Deserializes from the typed field map produced by [#toFieldMap]. Unknown extra keys + /// are ignored (forward compatibility for future field additions). 
+ /// + /// @throws IllegalArgumentException if any required key is missing or malformed + public static PartitionInfo fromFieldMap(Map map) { + Preconditions.checkArgument(map != null, "PartitionInfo field map must not be null"); + String stateStr = map.get(STATE_KEY); + String segmentCountStr = map.get(SEGMENT_COUNT_KEY); + String crcStr = map.get(CRC_KEY); + String lastRefreshTimeStr = map.get(LAST_REFRESH_TIME_KEY); + Preconditions.checkArgument(stateStr != null && segmentCountStr != null && crcStr != null + && lastRefreshTimeStr != null, + "PartitionInfo field map missing required keys; got: %s", map); + PartitionState state = PartitionState.decode(stateStr); + int segmentCount = Integer.parseInt(segmentCountStr); + long crcChecksum = Long.parseLong(crcStr); + long lastRefreshTime = Long.parseLong(lastRefreshTimeStr); + return new PartitionInfo(state, new PartitionFingerprint(segmentCount, crcChecksum), lastRefreshTime); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + PartitionInfo that = (PartitionInfo) o; + return _lastRefreshTime == that._lastRefreshTime + && _state == that._state + && Objects.equals(_fingerprint, that._fingerprint); + } + + @Override + public int hashCode() { + return Objects.hash(_state, _fingerprint, _lastRefreshTime); + } + + @Override + public String toString() { + return "PartitionInfo{state=" + _state + ", fingerprint=" + _fingerprint + + ", lastRefreshTime=" + _lastRefreshTime + "}"; + } +} diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionState.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionState.java new file mode 100644 index 000000000000..6c67139bceb3 --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/metadata/PartitionState.java @@ -0,0 +1,58 @@ +/** + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.metadata; + + +/// State of a materialized partition in the MV lifecycle. +/// +/// - `VALID` – partition is up-to-date with base table data. +/// - `STALE` – base table data has changed since last materialization; partition +/// needs OVERWRITE. +/// +/// Partition expiration is modeled by **absence** from the runtime metadata's +/// partition map, not as a separate state. The DELETE task path removes the +/// map entry; the broker then treats that bucket as "not covered by the MV" +/// and routes those queries to the base table. +/// +/// Encoded as a single character (`"V"` / `"S"`) for compact ZK storage. 
public enum PartitionState {
  VALID("V"),
  STALE("S");

  /// Single-character code returned by [#encode] and accepted by [#decode].
  private final String _code;

  PartitionState(String code) {
    _code = code;
  }

  /// Returns the single-character code of this state (`"V"` or `"S"`).
  public String encode() {
    return _code;
  }

  /// Resolves a code produced by [#encode] back to its state.
  ///
  /// The lookup walks the constants instead of repeating the code literals, so a newly added
  /// state is decodable without touching this method.
  ///
  /// @param code the encoded state; `null` is rejected like any other unknown code
  /// @throws IllegalArgumentException if no state has the given code
  public static PartitionState decode(String code) {
    for (PartitionState state : values()) {
      if (state._code.equals(code)) {
        return state;
      }
    }
    throw new IllegalArgumentException("Unknown PartitionState code: " + code);
  }
}
// (patch text that shared this source line, retained verbatim)
// diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskScheduler.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskScheduler.java
// new file mode 100644
// index 000000000000..86708f9f6b29
// --- /dev/null
// +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskScheduler.java
// @@ -0,0 +1,1009 @@
// +/**
// + * Licensed to the Apache Software Foundation (ASF) under one
// + * or more contributor license agreements. See the NOTICE file
// + * distributed with this work for additional information
// + * regarding copyright ownership. The ASF licenses this file
// + * to you under the Apache License, Version 2.0 (the
// + * "License"); you may not use this file except in compliance
// + * with the License. You may obtain a copy of the License at
// + *
// + * http://www.apache.org/licenses/LICENSE-2.0
// + *
// + * Unless required by applicable law or agreed to in writing,
// + * software distributed under the License is distributed on an
// + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// + * KIND, either express or implied. See the License for the
// + * specific language governing permissions and limitations
// + * under the License.
+ */ +package org.apache.pinot.materializedview.scheduler; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.hash.Hasher; +import com.google.common.hash.Hashing; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.regex.Pattern; +import org.apache.helix.store.HelixPropertyStore; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; +import org.apache.pinot.core.common.MinionConstants; +import org.apache.pinot.core.minion.PinotTaskConfig; +import org.apache.pinot.materializedview.analysis.MaterializedViewAnalyzer; +import org.apache.pinot.materializedview.context.MaterializedViewTaskGeneratorContext; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadata; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadata.MaterializedViewSplitSpec; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadataUtils; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadata; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadataUtils; +import org.apache.pinot.materializedview.metadata.PartitionFingerprint; +import org.apache.pinot.materializedview.metadata.PartitionInfo; +import org.apache.pinot.materializedview.metadata.PartitionState; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableTaskConfig; +import org.apache.pinot.spi.config.table.TableType; +import 
org.apache.pinot.spi.data.DateTimeFieldSpec; +import org.apache.pinot.spi.data.DateTimeFormatSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; +import org.apache.pinot.spi.utils.TimeUtils; +import org.apache.pinot.spi.utils.builder.TableNameBuilder; +import org.apache.zookeeper.data.Stat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/// Task generator for [MaterializedViewTask]. +/// +/// Unlike segment-conversion tasks, this generator does not scan source segments. It only +/// computes a time window and appends it to the user-defined SQL, producing a +/// [PinotTaskConfig] for the executor. +/// +/// Two-step decision logic (evaluated per table, per schedule cycle): +/// +/// - **Overwrite STALE** – If any partition is marked [PartitionState#STALE] +/// (by the event-driven `MaterializedViewConsistencyManager`), the generator +/// performs a precise fingerprint verification. If the data truly changed, it generates +/// an `OVERWRITE` task for the earliest stale partition. If the fingerprint +/// matches (false positive), the partition is reverted to [PartitionState#VALID]. +/// This step has the highest priority to maintain consistency. +/// - **Append** – If no STALE partitions exist and the watermark can advance (next +/// window is outside the buffer period), generate a normal `APPEND` task. +/// +/// +/// Dirty marking (STALE detection) is handled externally by +/// `MaterializedViewConsistencyManager`, which reacts to base table segment changes +/// (add, replace, delete) and proactively marks affected partitions in +/// [MaterializedViewRuntimeMetadata]. +/// +///

+/// <h3>Partition model (TIME-WINDOWED ONLY in PR 1)</h3>

+/// +/// All selection logic assumes time-windowed partitions of fixed width `bucketTimePeriod`: +/// the watermark is `long ms`, APPEND windows are `[watermarkMs, watermarkMs+bucketMs)`, and +/// the per-window source-time predicate is spliced into the SQL via text manipulation. A +/// future fixed-partition MV will need a different selection strategy (FIFO over STALE +/// partitions keyed by categorical id, no time arithmetic, no watermark advance) — see +/// `pinot-materialized-view/DESIGN.md`. The minion executor itself is partition-shape +/// neutral: it runs whichever SQL the scheduler emits. +public class MaterializedViewTaskScheduler { + private static final Logger LOGGER = LoggerFactory.getLogger(MaterializedViewTaskScheduler.class); + + /// Identifier pattern used to validate a column name before it is interpolated into the + /// time-range filter SQL fragment. Compiled once to avoid recompiling on every call. + private static final Pattern IDENTIFIER_PATTERN = Pattern.compile("[A-Za-z_][A-Za-z0-9_.]*"); + + /// Compile-time default for the APPEND batch-scheduling-loop iteration cap. Overridable + /// per cluster (no restart) via `MaterializedViewTask.CLUSTER_CONFIG_KEY_MAX_BATCH_LOOP_ITERATIONS`. + /// Sized to comfortably exceed any realistic `availableSlots + partitionInfos.size()` + /// for production MVs (a 10-year hourly MV has ~88k partitions, a 30-year daily MV ~11k); + /// pathological maps beyond that stop the loop early so the scheduler can recover on the + /// next cycle. Distinct from [MaterializedViewTask#MAX_TASKS_PER_BATCH_USER_CAP], + /// which is the user-facing upper bound on the `maxTasksPerBatch` table-config value. 
+ static final int DEFAULT_MAX_BATCH_LOOP_ITERATIONS = 100_000; + + private final MaterializedViewTaskGeneratorContext _context; + + public MaterializedViewTaskScheduler(MaterializedViewTaskGeneratorContext context) { + _context = context; + } + + public String getTaskType() { + return MaterializedViewTask.TASK_TYPE; + } + + public List generateTasks(List tableConfigs) { + String taskType = MaterializedViewTask.TASK_TYPE; + List pinotTaskConfigs = new ArrayList<>(); + + for (TableConfig tableConfig : tableConfigs) { + String offlineTableName = tableConfig.getTableName(); + + if (tableConfig.getTableType() != TableType.OFFLINE) { + LOGGER.warn("Skip generating task: {} for non-OFFLINE table: {}", taskType, offlineTableName); + continue; + } + LOGGER.info("Start generating task configs for table: {} for task: {}", offlineTableName, taskType); + + TableTaskConfig tableTaskConfig = tableConfig.getTaskConfig(); + Preconditions.checkState(tableTaskConfig != null); + Map taskConfigs = tableTaskConfig.getConfigsForTaskType(taskType); + Preconditions.checkState(taskConfigs != null, "Task config shouldn't be null for table: %s", offlineTableName); + + String definedSQL = taskConfigs.get(MaterializedViewTask.DEFINED_SQL_KEY); + Preconditions.checkState(definedSQL != null && !definedSQL.isEmpty(), + "definedSQL must be specified for table: %s", offlineTableName); + + String sourceTableName = MaterializedViewAnalyzer.extractSourceTableName(definedSQL); + String sourceTableWithType = resolveSourceTableNameWithType(sourceTableName); + + // Bucket and buffer. bucketTimePeriod is required (validated by MaterializedViewAnalyzer + // at create time; re-checked here so a hand-edited table config does not silently fall + // back to an implicit default). 
+ String bucketTimePeriod = taskConfigs.get(MaterializedViewTask.BUCKET_TIME_PERIOD_KEY); + Preconditions.checkState(bucketTimePeriod != null && !bucketTimePeriod.isEmpty(), + "MaterializedViewTask requires '%s' to be set on table: %s", + MaterializedViewTask.BUCKET_TIME_PERIOD_KEY, offlineTableName); + long bucketMs = TimeUtils.convertPeriodToMillis(bucketTimePeriod); + String bufferTimePeriod = + taskConfigs.getOrDefault(MaterializedViewTask.BUFFER_TIME_PERIOD_KEY, "0d"); + long bufferMs = TimeUtils.convertPeriodToMillis(bufferTimePeriod); + Preconditions.checkState(bufferMs >= 0, + "bufferTimePeriod must be non-negative for table: %s, got: %s", + offlineTableName, bufferTimePeriod); + + String maxTasksPerBatchStr = taskConfigs.getOrDefault( + MaterializedViewTask.MAX_TASKS_PER_BATCH_KEY, + String.valueOf(MaterializedViewTask.DEFAULT_MAX_TASKS_PER_BATCH)); + int maxTasksPerBatch; + try { + maxTasksPerBatch = Integer.parseInt(maxTasksPerBatchStr); + } catch (NumberFormatException e) { + throw new IllegalStateException( + "Invalid maxTasksPerBatch '" + maxTasksPerBatchStr + "' for table: " + offlineTableName, e); + } + Preconditions.checkState(maxTasksPerBatch >= 1, + "maxTasksPerBatch must be >= 1 for table: %s, got: %s", offlineTableName, maxTasksPerBatch); + + // Resolve the effective LIMIT once via the AST. The user-declared value is used when + // present; otherwise DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT (a bounded cap, not Integer.MAX_VALUE). + // Both values flow downstream: (a) appended to the broker SQL so the broker's silent + // default-LIMIT-of-10 cannot truncate, and (b) stored in EFFECTIVE_LIMIT_KEY so the + // executor's saturation gate fires if a window's result set saturates the cap. 
+ Optional declaredLimit = MaterializedViewAnalyzer.tryExtractDeclaredLimit(definedSQL); + boolean userDeclaredLimit = declaredLimit.isPresent(); + // The analyzer's create-time validation guarantees any present LIMIT is positive + // (and at most MAX_MATERIALIZED_VIEW_QUERY_LIMIT, currently 100_000_000); the orElse default + // is also positive. effectiveLimit > 0 holds. + int defaultLimit = MaterializedViewTaskUtils.readPositiveIntClusterConfigOrDefault( + _context::getClusterConfig, + MaterializedViewTask.CLUSTER_CONFIG_KEY_DEFAULT_QUERY_LIMIT, + MaterializedViewTask.DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT); + int effectiveLimit = declaredLimit.orElse(defaultLimit); + + // Load watermarkMs and partitionInfos from MaterializedViewRuntimeMetadata + String viewTableWithType = TableNameBuilder.OFFLINE.tableNameWithType(offlineTableName); + HelixPropertyStore propertyStore = _context.getPropertyStore(); + long watermarkMs = getWatermarkMs(offlineTableName, sourceTableName, bucketMs, definedSQL, taskConfigs); + + Stat rtStat = new Stat(); + MaterializedViewRuntimeMetadata runtime = MaterializedViewRuntimeMetadataUtils.fetchWithVersion( + propertyStore, viewTableWithType, rtStat); + Map partitionInfos = new HashMap<>(); + int runtimeVersion = -1; + if (runtime != null) { + partitionInfos = new HashMap<>(runtime.getPartitions()); + runtimeVersion = rtStat.getVersion(); + } + + // Walk the in-flight task configs ONCE and bucket per-mode. APPEND tasks must not + // starve DELETE/OVERWRITE (and vice versa) — they have independent gates. We also + // track the highest in-flight APPEND windowEnd so we know where to start new ones. + InFlightTaskCounts counts = countInFlightTasks(offlineTableName, taskType, watermarkMs); + + // ── Step 1: Handle STALE partitions ── + // Under Design C there is no separate EXPIRED state. 
When the scheduler finds a STALE + // partition it re-computes the source fingerprint and dispatches one of: + // - DELETE task: source data is gone (segmentCount == 0) — drop MV segments + remove the + // partition entry from the runtime metadata + // - revert to VALID: fingerprint matches the stored value (false positive STALE marking) + // - OVERWRITE task: source changed — re-materialize the partition + if (counts.exclusiveModeCount() > 0) { + LOGGER.debug("Found {} in-flight DELETE/{} OVERWRITE tasks for table: {}; skipping", + counts._inFlightDeleteCount.get(), counts._inFlightOverwriteCount.get(), offlineTableName); + } else { + PinotTaskConfig staleTask = tryHandleStalePartition(offlineTableName, sourceTableName, + sourceTableWithType, definedSQL, taskConfigs, partitionInfos, bucketMs, + effectiveLimit, userDeclaredLimit); + if (staleTask != null) { + pinotTaskConfigs.add(staleTask); + LOGGER.info("Generated {} task for table: {}", + staleTask.getConfigs().get(MaterializedViewTask.TASK_MODE_KEY), offlineTableName); + continue; + } + } + + // ── Step 2: Append new data — schedule up to maxTasksPerBatch windows ── + int inFlightAppend = counts._inFlightAppendCount.get(); + int availableSlots = maxTasksPerBatch - inFlightAppend; + if (availableSlots <= 0) { + LOGGER.debug("MV table {} already has {} in-flight APPEND tasks (max={}); skipping", + offlineTableName, inFlightAppend, maxTasksPerBatch); + continue; + } + // Start scheduling from the max of (a) highest in-flight APPEND windowEnd and + // (b) highest contiguous VALID upper from partitionInfos. (b) prevents replay of + // already-VALID windows when a previous batch had a mid-batch failure that left + // some windows VALID and others FAILED — the in-flight set is then empty but + // some windows past `watermarkMs` are already done. 
+ long contiguousValidUpper = MaterializedViewTaskUtils.computeContiguousUpperMs( + watermarkMs, partitionInfos, bucketMs); + long maxInFlightAppendWindowEndMs = + Math.max(counts._maxInFlightAppendWindowEndMs.get(), contiguousValidUpper); + + // Start scheduling from the end of the highest in-flight window to avoid duplicates. + // Floor-align to bucketMs in case an in-flight task was scheduled under a different + // bucketTimePeriod (operator changed config mid-flight); the partition map is keyed by + // aligned starts, so a misaligned nextWindowStartMs would miss VALID-skip lookups. + long nextWindowStartMs = Math.floorDiv(maxInFlightAppendWindowEndMs, bucketMs) * bucketMs; + long cutoffMs = System.currentTimeMillis() - bufferMs; + int scheduled = 0; + + // Hard cap on iterations to defend against pathological partition maps. The loop + // advances nextWindowStartMs by bucketMs each iteration, so a finite cutoff bounds it, + // but skipping VALID slots could still in principle iterate forever if the cutoff is + // far in the future. Cap at availableSlots + partitionInfos.size() — any more is a bug. + // Also clamp at DEFAULT_MAX_BATCH_LOOP_ITERATIONS (or its cluster-config override) so an MV + // table accumulating years of VALID partitions doesn't burn unbounded CPU per cycle. + int maxBatchLoopIterations = MaterializedViewTaskUtils.readPositiveIntClusterConfigOrDefault( + _context::getClusterConfig, + MaterializedViewTask.CLUSTER_CONFIG_KEY_MAX_BATCH_LOOP_ITERATIONS, + DEFAULT_MAX_BATCH_LOOP_ITERATIONS); + int maxIterations = Math.min(availableSlots + partitionInfos.size(), maxBatchLoopIterations); + int iterations = 0; + while (scheduled < availableSlots && iterations < maxIterations) { + iterations++; + long windowEndMs = nextWindowStartMs + bucketMs; + if (windowEndMs > cutoffMs) { + break; + } + // Skip already-VALID slots from a prior partial batch (mid-batch failure recovery). 
+ // Re-running an APPEND for a VALID partition would produce duplicate segments. + PartitionInfo existing = partitionInfos.get(nextWindowStartMs); + if (existing != null && existing.getState() == PartitionState.VALID) { + nextWindowStartMs = windowEndMs; + continue; + } + PinotTaskConfig appendTask = buildTaskConfig(offlineTableName, sourceTableName, + sourceTableWithType, definedSQL, taskConfigs, nextWindowStartMs, windowEndMs, + MaterializedViewTask.TASK_MODE_APPEND, effectiveLimit, userDeclaredLimit); + pinotTaskConfigs.add(appendTask); + LOGGER.info("Generated APPEND task for table: {} window [{}, {})", offlineTableName, + nextWindowStartMs, windowEndMs); + nextWindowStartMs = windowEndMs; + scheduled++; + } + // Surface the cap-hit case (scheduler ran out of iterations before filling availableSlots) + // so a corrupt partition map doesn't silently masquerade as "caught up". + if (iterations >= maxIterations && scheduled < availableSlots) { + LOGGER.error("MV table {} APPEND scheduler hit maxIterations cap ({}); partition map " + + "may be corrupted (size={}, scheduled={}). Investigate stale VALID partitions.", + offlineTableName, maxIterations, partitionInfos.size(), scheduled); + } + + if (scheduled == 0) { + LOGGER.debug("MV table {} is caught up (watermark={}), no dirty partitions.", offlineTableName, watermarkMs); + } + } + return pinotTaskConfigs; + } + + /// Step 1: Finds the earliest STALE partition and dispatches the appropriate task. 
+ /// + /// Re-computes the source fingerprint and picks one of: + /// - DELETE task: source data is gone (segmentCount == 0) — task will drop MV segments + /// and remove the partition entry from runtime metadata + /// - revert to VALID in place: fingerprint matches stored value (false positive) + /// - OVERWRITE task: fingerprint differs — re-materialize the partition + /// + /// @return a [PinotTaskConfig] for DELETE or OVERWRITE, or `null` if no actionable + /// STALE partition exists (either none STALE, or the only STALE one was reverted + /// to VALID in place). + private PinotTaskConfig tryHandleStalePartition(String viewTableName, String sourceTableName, + String sourceTableWithType, String definedSQL, Map taskConfigs, + Map partitionInfos, long bucketMs, + int effectiveLimit, boolean userDeclaredLimit) { + long earliestStaleMs = Long.MAX_VALUE; + for (Map.Entry entry : partitionInfos.entrySet()) { + if (entry.getValue().getState() == PartitionState.STALE && entry.getKey() < earliestStaleMs) { + earliestStaleMs = entry.getKey(); + } + } + if (earliestStaleMs == Long.MAX_VALUE) { + return null; + } + + long windowStartMs = earliestStaleMs; + long windowEndMs = windowStartMs + bucketMs; + PartitionInfo staleInfo = partitionInfos.get(earliestStaleMs); + + PartitionFingerprint currentFp = computeWindowFingerprint(sourceTableWithType, windowStartMs, windowEndMs); + + if (currentFp.getSegmentCount() == 0) { + LOGGER.info("STALE partition [{}, {}) base data deleted for table: {}. 
Generating DELETE task.", + windowStartMs, windowEndMs, viewTableName); + Map configs = new HashMap<>(); + configs.put(MinionConstants.TABLE_NAME_KEY, viewTableName); + configs.put(MaterializedViewTask.WINDOW_START_MS_KEY, String.valueOf(windowStartMs)); + configs.put(MaterializedViewTask.WINDOW_END_MS_KEY, String.valueOf(windowEndMs)); + configs.put(MaterializedViewTask.TASK_MODE_KEY, MaterializedViewTask.TASK_MODE_DELETE); + configs.put(MinionConstants.UPLOAD_URL_KEY, _context.getVipUrl() + "/segments"); + return new PinotTaskConfig(MaterializedViewTask.TASK_TYPE, configs); + } + + if (currentFp.equals(staleInfo.getFingerprint())) { + LOGGER.info("STALE partition [{}, {}) fingerprint matches for table: {}. " + + "Reverting to VALID (false positive).", windowStartMs, windowEndMs, viewTableName); + persistPartitionStateChangeWithRetry(viewTableName, earliestStaleMs, PartitionState.VALID); + return null; + } + + LOGGER.info("Confirmed STALE partition at {} for table: {}. Generating OVERWRITE task for window [{}, {})", + windowStartMs, viewTableName, windowStartMs, windowEndMs); + return buildTaskConfig(viewTableName, sourceTableName, sourceTableWithType, definedSQL, + taskConfigs, windowStartMs, windowEndMs, MaterializedViewTask.TASK_MODE_OVERWRITE, + effectiveLimit, userDeclaredLimit); + } + + /// CAS-retry budget for STALE -> VALID transitions written by the scheduler. + /// The runtime znode is concurrently mutated by the executor (after each task completion) and by + /// the consistency manager (on base table changes). A bounded retry loop converges in practice. + private static final int MAX_PARTITION_STATE_PERSIST_RETRIES = 8; + + /// Persists a STALE -> VALID transition (false-positive recovery) under a CAS retry loop. + /// On each attempt the latest runtime znode is re-fetched, the target partition's state is + /// re-evaluated, and the change is rewritten on top of the current version. 
This preserves + /// concurrent updates from the executor (watermark advance) and the consistency manager + /// (other partitions' STALE markings). + /// + /// If the partition is no longer STALE on a retry (executor or consistency manager + /// already changed it), the method exits successfully — the desired transition is either + /// already done or no longer applicable to a stale view of the world. + /// + /// If the budget is exhausted, logs ERROR. The next scheduling cycle will retry. + private void persistPartitionStateChangeWithRetry(String viewTableName, long partitionStartMs, + PartitionState newState) { + String viewTableWithType = TableNameBuilder.OFFLINE.tableNameWithType(viewTableName); + Exception lastException = null; + for (int attempt = 0; attempt < MAX_PARTITION_STATE_PERSIST_RETRIES; attempt++) { + Stat stat = new Stat(); + MaterializedViewRuntimeMetadata current = MaterializedViewRuntimeMetadataUtils.fetchWithVersion( + _context.getPropertyStore(), viewTableWithType, stat); + if (current == null) { + LOGGER.warn("Runtime metadata missing for MV table: {} during partition state persist; aborting", + viewTableName); + return; + } + Map currentInfos = current.getPartitions(); + PartitionInfo info = currentInfos.get(partitionStartMs); + if (info == null || info.getState() != PartitionState.STALE) { + LOGGER.info("Partition {} for MV table: {} is no longer STALE on attempt {}; skipping persist", + partitionStartMs, viewTableName, attempt + 1); + return; + } + Map updatedInfos = new HashMap<>(currentInfos); + updatedInfos.put(partitionStartMs, info.withState(newState)); + MaterializedViewRuntimeMetadata updated = new MaterializedViewRuntimeMetadata( + current.getMaterializedViewTableNameWithType(), + current.getWatermarkMs(), + updatedInfos); + try { + MaterializedViewRuntimeMetadataUtils.persist(_context.getPropertyStore(), updated, stat.getVersion()); + LOGGER.info("Persisted partition {} state {} -> {} for MV table: {} on attempt {}", + 
partitionStartMs, PartitionState.STALE, newState, viewTableName, attempt + 1); + return; + } catch (IllegalStateException e) { + // Writer-side invariant violation surfaced by `validateForPersist` on the freshly-fetched + // runtime. Retrying will not change the underlying state — fail fast so the caller can + // surface the bug instead of burning the retry budget. + LOGGER.error("Aborting CAS retry for MV table {}: writer-side invariant violation " + + "({}). Generator will not retry until the underlying runtime znode is fixed.", + viewTableName, e.getMessage()); + return; + } catch (Exception e) { + lastException = e; + LOGGER.debug("CAS conflict on attempt {} persisting partition {} state for MV table: {}", + attempt + 1, partitionStartMs, viewTableName, e); + } + // Small jittered backoff so a tight CAS race doesn't burn the budget in microseconds + // and starve the competing writers — also gives transient ZK errors a chance to resolve. + try { + Thread.sleep(5L + ThreadLocalRandom.current().nextInt(20)); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + LOGGER.warn("Interrupted while persisting partition {} state for MV table: {}", + partitionStartMs, viewTableName); + return; + } + } + LOGGER.error("Failed to persist partition {} state {} for MV table: {} after {} retries. " + + "Generator will retry on next scheduling cycle. Last exception:", + partitionStartMs, newState, viewTableName, MAX_PARTITION_STATE_PERSIST_RETRIES, lastException); + } + + /// Builds a complete [PinotTaskConfig] for either APPEND or OVERWRITE mode. + /// + /// @param effectiveLimit pre-resolved LIMIT (user-declared, or + /// [MaterializedViewTask#DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT] when absent in `definedSQL`). + /// Same value flows to the broker SQL and to `EFFECTIVE_LIMIT_KEY` for the gate. + /// @param userDeclaredLimit `true` if `definedSQL` already contains a LIMIT clause + /// (AST-detected by caller). 
When `false`, `effectiveLimit` is appended to the + /// broker SQL here. + private PinotTaskConfig buildTaskConfig(String viewTableName, String sourceTableName, + String sourceTableWithType, String definedSQL, Map taskConfigs, + long windowStartMs, long windowEndMs, String taskMode, int effectiveLimit, + boolean userDeclaredLimit) { + String taskType = MaterializedViewTask.TASK_TYPE; + + PartitionFingerprint windowFingerprint = + computeWindowFingerprint(sourceTableWithType, windowStartMs, windowEndMs); + + // The source time column may use any DateTimeFieldSpec format (TIMESTAMP, INT-days, etc.). + // Convert the window boundaries to the source's native unit so the appended WHERE filter + // compares apples to apples. + DateTimeFieldSpec sourceTimeFieldSpec = resolveSourceTimeFieldSpec(sourceTableName); + String sourceTimeColumn = sourceTimeFieldSpec.getName(); + DateTimeFormatSpec sourceTimeFormat = sourceTimeFieldSpec.getFormatSpec(); + String windowStart = sourceTimeFormat.fromMillisToFormat(windowStartMs); + String windowEnd = sourceTimeFormat.fromMillisToFormat(windowEndMs); + String sqlWithTimeRange = appendTimeRange(definedSQL, sourceTimeColumn, windowStart, windowEnd); + // If the user did not declare LIMIT, append the bounded default so the broker doesn't + // apply its own (small) cluster default. AST-based detection by the caller is reliable — + // a text scan would mis-fire on string literals or comments containing "LIMIT". + if (!userDeclaredLimit) { + sqlWithTimeRange = sqlWithTimeRange.trim(); + if (sqlWithTimeRange.endsWith(";")) { + sqlWithTimeRange = sqlWithTimeRange.substring(0, sqlWithTimeRange.length() - 1).trim(); + } + sqlWithTimeRange = sqlWithTimeRange + " LIMIT " + effectiveLimit; + } + // Defense: re-parse the final broker-bound SQL and verify the LIMIT is syntactically + // active. 
Catches text-manipulation hazards in either branch — auto-injected LIMIT + // swallowed by a trailing comment, or appendTimeRange's clause-keyword scan corrupting + // SQL that contained those keywords inside string literals. + Optional verifyLimit; + try { + verifyLimit = MaterializedViewAnalyzer.tryExtractDeclaredLimit(sqlWithTimeRange); + } catch (IllegalStateException e) { + // Calcite parse failure (validateSqlSyntax wraps SqlCompilationException as + // IllegalStateException) — surface with MV table context for operator triage. + throw new IllegalStateException("Broker-bound SQL is unparseable for MV table: " + + viewTableName + ". Check definedSQL for syntax issues (trailing comments, unbalanced " + + "quotes, etc). SQL: " + sqlWithTimeRange, e); + } + String observedLimit = verifyLimit.map(String::valueOf).orElse(""); + Preconditions.checkState( + verifyLimit.isPresent() && verifyLimit.get().intValue() == effectiveLimit, + "LIMIT verification failed for MV table: %s. Re-parsed SQL has LIMIT=%s, expected %s. " + + "definedSQL likely contains text (comments, literals) that interfered with SQL " + + "manipulation. 
SQL: %s", + viewTableName, observedLimit, effectiveLimit, sqlWithTimeRange); + + Map configs = new HashMap<>(); + configs.put(MinionConstants.TABLE_NAME_KEY, viewTableName); + configs.put(MaterializedViewTask.DEFINED_SQL_KEY, sqlWithTimeRange); + configs.put(MaterializedViewTask.WINDOW_START_MS_KEY, String.valueOf(windowStartMs)); + configs.put(MaterializedViewTask.WINDOW_END_MS_KEY, String.valueOf(windowEndMs)); + configs.put(MaterializedViewTask.SOURCE_TABLE_NAME_KEY, sourceTableName); + configs.put(MaterializedViewTask.TASK_MODE_KEY, taskMode); + configs.put(MaterializedViewTask.EFFECTIVE_LIMIT_KEY, String.valueOf(effectiveLimit)); + configs.put(MinionConstants.UPLOAD_URL_KEY, _context.getVipUrl() + "/segments"); + + String maxNumRecords = taskConfigs.get(MaterializedViewTask.MAX_NUM_RECORDS_PER_SEGMENT_KEY); + if (maxNumRecords != null) { + configs.put(MaterializedViewTask.MAX_NUM_RECORDS_PER_SEGMENT_KEY, maxNumRecords); + } + + Map fingerprintMap = new HashMap<>(); + fingerprintMap.put(windowStartMs, windowFingerprint); + configs.put(MaterializedViewTask.PARTITION_FINGERPRINTS_KEY, + PartitionFingerprint.encodeMap(fingerprintMap)); + + return new PinotTaskConfig(taskType, configs); + } + + public void validateTaskConfigs(TableConfig tableConfig, Schema schema, Map taskConfigs) { + MaterializedViewAnalyzer.analyze( + taskConfigs.get(MaterializedViewTask.DEFINED_SQL_KEY), tableConfig, schema, taskConfigs, _context); + } + + /// Resolves the [DateTimeFieldSpec] for the source table's time column. The returned + /// spec provides both the column name and its format for converting ms watermarks to the + /// column's native format (e.g. days since epoch for `1:DAYS:EPOCH`). 
+ private DateTimeFieldSpec resolveSourceTimeFieldSpec(String rawSourceTableName) { + String sourceTableWithType = resolveSourceTableNameWithType(rawSourceTableName); + TableConfig sourceTableConfig = _context.getTableConfig(sourceTableWithType); + Preconditions.checkState(sourceTableConfig != null, + "Source table config not found for: %s", rawSourceTableName); + + String timeColumn = sourceTableConfig.getValidationConfig().getTimeColumnName(); + Preconditions.checkState(timeColumn != null && !timeColumn.isEmpty(), + "Time column not configured for source table: %s", rawSourceTableName); + + Schema sourceSchema = _context.getTableSchema(sourceTableWithType); + Preconditions.checkState(sourceSchema != null, + "Schema not found for source table: %s", rawSourceTableName); + + DateTimeFieldSpec fieldSpec = sourceSchema.getSpecForTimeColumn(timeColumn); + Preconditions.checkState(fieldSpec != null, + "No DateTimeFieldSpec found for time column '%s' in source table: %s", timeColumn, rawSourceTableName); + return fieldSpec; + } + + /// Resolves the MV table's designated time column from its [TableConfig]. The MV + /// side of a split query filters on this column (e.g. `materializedViewTime < watermarkMs`), + /// which may differ from the source time column when the defined SQL renames or buckets + /// the time via a `dateTimeConvert`/`DATETRUNC` expression. + private String resolveMaterializedViewTimeColumn(String viewTableWithType) { + TableConfig viewTableConfig = _context.getTableConfig(viewTableWithType); + Preconditions.checkState(viewTableConfig != null, + "MV table config not found for: %s", viewTableWithType); + + String timeColumn = viewTableConfig.getValidationConfig().getTimeColumnName(); + Preconditions.checkState(timeColumn != null && !timeColumn.isEmpty(), + "Time column not configured for MV table: %s (required for split queries)", viewTableWithType); + return timeColumn; + } + + /// Appends a time-range WHERE clause to the SQL. 
Window values are raw epoch millis since + /// both base and MV time columns are TIMESTAMP (enforced by [MaterializedViewAnalyzer]). + /// If a WHERE clause already exists, appends with AND; otherwise inserts before GROUP BY / + /// ORDER BY / the trailing semicolon. + /// + /// Keyword scans (`WHERE`, `FROM`, `GROUP BY`, ...) skip text inside single-quoted + /// string literals so a column value like `'WHERE me'` cannot fool the splitter + /// into corrupting the SQL. Identifiers and column names are required to be simple + /// (validated by the analyzer); double-quoted identifiers are treated like string + /// literals (skipped). + static String appendTimeRange(String sql, String timeColumn, String windowStart, String windowEnd) { + Preconditions.checkArgument(sql != null, "SQL must not be null"); + // Validate column name to prevent SQL injection (column names must be simple identifiers). + Preconditions.checkArgument(IDENTIFIER_PATTERN.matcher(timeColumn).matches(), + "Time column name contains invalid characters: %s", timeColumn); + // Reject SQL containing nested SELECTs / subqueries: the keyword splitter below finds the + // FIRST `WHERE` and inserts the time predicate after it, which would attach the predicate to + // the inner subquery's WHERE rather than the outer query's — producing a semantically-wrong + // task SQL. MV definitions are intentionally restricted to flat queries; reject up front + // rather than silently corrupt. Counts standalone SELECT tokens outside string literals + // and comments via the same mask used downstream. + Preconditions.checkArgument(countStandaloneSelectKeywords(sql) <= 1, + "MV definedSQL must not contain a nested SELECT / subquery; got: %s", sql); + + // windowStart/windowEnd come from DateTimeFormatSpec.fromMillisToFormat — numeric for EPOCH + // formats (TIMESTAMP, epoch-days, etc.), or quoted strings for SIMPLE_DATE_FORMAT. Quote + // non-numeric values so the SQL parser treats them as string literals. 
+ String quotedStart = isNumeric(windowStart) ? windowStart : "'" + windowStart + "'"; + String quotedEnd = isNumeric(windowEnd) ? windowEnd : "'" + windowEnd + "'"; + String timeFilter = timeColumn + " >= " + quotedStart + " AND " + timeColumn + " < " + quotedEnd; + + // Remove trailing semicolon for easier manipulation + String trimmed = sql.trim(); + if (trimmed.endsWith(";")) { + trimmed = trimmed.substring(0, trimmed.length() - 1).trim(); + } + + // Use Locale.ROOT — the default locale can change a string's length on + // toUpperCase (e.g. de_DE turns 'ß' into "SS", growing length by one), which + // would misalign the quoteMask and let user-supplied keywords inside string + // literals fool the splitter. + String upperSql = trimmed.toUpperCase(Locale.ROOT); + // Build a parallel mask that marks positions inside single-quoted string literals or + // double-quoted identifiers, so keyword scans skip over user-controlled text. + boolean[] quoteMask = buildQuoteMask(trimmed); + // indexOfKeywordWithBoundary returns the start index of the keyword. 
+ int whereIdx = indexOfKeywordWithBoundary(upperSql, quoteMask, "WHERE", 0); + if (whereIdx >= 0) { + int afterWhere = whereIdx + "WHERE".length(); + int insertPos = findClauseEnd(upperSql, quoteMask, afterWhere); + return trimmed.substring(0, insertPos) + " AND " + timeFilter + trimmed.substring(insertPos); + } + + // No WHERE — insert before GROUP BY / ORDER BY / LIMIT / HAVING / end + int fromIdx = indexOfKeywordWithBoundary(upperSql, quoteMask, "FROM", 0); + Preconditions.checkState(fromIdx >= 0, "definedSQL is missing a FROM clause: %s", sql); + int afterFrom = fromIdx + "FROM".length(); + int insertPos = findClauseEnd(upperSql, quoteMask, afterFrom); + return trimmed.substring(0, insertPos) + " WHERE " + timeFilter + trimmed.substring(insertPos); + } + + private static boolean isNumeric(String value) { + if (value == null || value.isEmpty()) { + return false; + } + for (int i = 0; i < value.length(); i++) { + char c = value.charAt(i); + if (!Character.isDigit(c) && c != '-' && c != '.') { + return false; + } + } + return true; + } + + /// Counts standalone occurrences of the `SELECT` keyword in `sql` outside string literals, + /// double-quoted identifiers, and SQL comments. A flat MV `definedSQL` has exactly one; + /// a value of two or more indicates a nested SELECT / subquery, which is unsupported because + /// the text-based [#appendTimeRange] inserts its time predicate after the FIRST `WHERE` and + /// would attach to the inner query's WHERE rather than the outer query. 
+ static int countStandaloneSelectKeywords(String sql) { + String upper = sql.toUpperCase(Locale.ROOT); + boolean[] mask = buildQuoteMask(sql); + int count = 0; + int from = 0; + while (true) { + int idx = indexOfKeywordWithBoundary(upper, mask, "SELECT", from); + if (idx < 0) { + break; + } + count++; + from = idx + "SELECT".length(); + } + return count; + } + + private static long parseLong(String value, long defaultValue) { + if (value == null || value.isEmpty()) { + return defaultValue; + } + try { + return Long.parseLong(value); + } catch (NumberFormatException e) { + return defaultValue; + } + } + + /// Builds a parallel mask of the SQL where `mask[i] == true` means position `i` + /// is inside text the keyword scanner must skip: + /// + /// - single-quoted string literals (with ANSI doubled-quote escape `''`) + /// - double-quoted identifiers (with ANSI doubled-quote escape `""`) + /// - `--` line comments through end-of-line + /// - `/* ... *``/` block comments + /// + /// Without comment masking a `definedSQL` fragment such as `-- WHERE x` would + /// fool the splitter into treating the comment text as a real WHERE clause. + private static boolean[] buildQuoteMask(String sql) { + boolean[] mask = new boolean[sql.length()]; + char active = 0; // 0 means outside any quoted region; otherwise the quote char. + int i = 0; + int len = sql.length(); + while (i < len) { + char c = sql.charAt(i); + if (active == 0) { + if (c == '-' && i + 1 < len && sql.charAt(i + 1) == '-') { + // -- line comment: mask through end-of-line (or end-of-string). + int eol = sql.indexOf('\n', i + 2); + int end = eol < 0 ? len : eol; + for (int j = i; j < end; j++) { + mask[j] = true; + } + i = end; + } else if (c == '/' && i + 1 < len && sql.charAt(i + 1) == '*') { + // /* ... */ block comment: mask through closing */ (or end-of-string). + int close = sql.indexOf("*/", i + 2); + int end = close < 0 ? 
len : close + 2; + for (int j = i; j < end; j++) { + mask[j] = true; + } + i = end; + } else if (c == '\'' || c == '"') { + active = c; + mask[i] = true; + i++; + } else { + i++; + } + } else { + mask[i] = true; + if (c == active) { + // ANSI doubled-quote escape: '' or "" stays inside the literal. + if (i + 1 < len && sql.charAt(i + 1) == active) { + mask[i + 1] = true; + i += 2; + } else { + active = 0; + i++; + } + } else { + i++; + } + } + } + return mask; + } + + /// Finds the next standalone occurrence of `keyword` in `upperSql` starting + /// at `fromIdx`, requiring whitespace boundaries on both sides (or string edge) so + /// a column or alias containing the keyword as a substring cannot match. Skips any + /// positions covered by `quoteMask`. Returns the start index of the keyword + /// itself, or -1 if not found. Callers that want to insert *before* the keyword (and + /// preserve the leading whitespace) should use the returned index minus 1 — except + /// when the keyword is at index 0, in which case there is no preceding boundary char. + private static int indexOfKeywordWithBoundary(String upperSql, boolean[] quoteMask, String keyword, + int fromIdx) { + int idx = fromIdx; + int len = upperSql.length(); + while (idx <= len - keyword.length()) { + int found = upperSql.indexOf(keyword, idx); + if (found < 0) { + return -1; + } + int end = found + keyword.length(); + boolean leftBoundary = found == 0 || isSqlBoundary(upperSql.charAt(found - 1)); + boolean rightBoundary = end == len || isSqlBoundary(upperSql.charAt(end)); + if (leftBoundary && rightBoundary) { + // Reject if any character of the keyword overlaps a quoted/comment region. 
+ boolean inMasked = false; + for (int j = 0; j < keyword.length(); j++) { + if (quoteMask[found + j]) { + inMasked = true; + break; + } + } + if (!inMasked) { + return found; + } + } + idx = found + 1; + } + return -1; + } + + private static boolean isSqlBoundary(char c) { + return Character.isWhitespace(c) || c == '(' || c == ')' || c == ',' || c == ';'; + } + + /// Finds the position immediately before the next major SQL clause keyword (GROUP, ORDER, + /// HAVING, LIMIT) starting from `fromIdx`, skipping over quoted regions and + /// comments. Returns the index of the boundary char preceding the keyword (so a + /// substring-split at this position preserves the whitespace), or the end of the string + /// if no keyword is found. Match is by whitespace boundary on both sides so a keyword + /// preceded by a newline (e.g. after a line comment) is still detected. + private static int findClauseEnd(String upperSql, boolean[] quoteMask, int fromIdx) { + String[] keywords = {"GROUP", "ORDER", "HAVING", "LIMIT"}; + int minIdx = upperSql.length(); + for (String keyword : keywords) { + int kw = indexOfKeywordWithBoundary(upperSql, quoteMask, keyword, fromIdx); + // Adjust to the boundary char preceding the keyword. The keyword cannot legitimately + // appear at index 0 of a trimmed SQL (which starts with SELECT), so kw == 0 is + // unreachable here; defend against it just in case. + int idx = (kw < 0) ? -1 : (kw == 0 ? 0 : kw - 1); + if (idx >= 0 && idx < minIdx) { + minIdx = idx; + } + } + return minIdx; + } + + /// Per-mode in-flight task counts for one MV table, derived from the live task configs. + /// APPEND uses the count to enforce `maxTasksPerBatch`. DELETE and OVERWRITE each + /// gate themselves at "single concurrent task" — and they are also mutually exclusive with + /// each other (both touch existing MV segments via segment-replace), so each step checks + /// both counts. 
+ /// + /// Atomic fields are used defensively: context implementations usually iterate synchronously, + /// but a future change to async iteration would otherwise silently miscount and let + /// DELETE/OVERWRITE schedule simultaneously, double-replacing segments. + private static final class InFlightTaskCounts { + final AtomicInteger _inFlightAppendCount = new AtomicInteger(); + final AtomicInteger _inFlightDeleteCount = new AtomicInteger(); + final AtomicInteger _inFlightOverwriteCount = new AtomicInteger(); + final AtomicLong _maxInFlightAppendWindowEndMs; + + InFlightTaskCounts(long initialMaxAppendWindowEndMs) { + _maxInFlightAppendWindowEndMs = new AtomicLong(initialMaxAppendWindowEndMs); + } + + int exclusiveModeCount() { + return _inFlightDeleteCount.get() + _inFlightOverwriteCount.get(); + } + } + + /// Walks the live task configs for `offlineTableName` once and buckets per-mode. + /// Required because OVERWRITE/DELETE and APPEND must have independent gates — sharing + /// a single "incompleteTasks" check made an in-flight APPEND silently block any DELETE + /// of a stale partition. 
+ private InFlightTaskCounts countInFlightTasks(String offlineTableName, String taskType, + long watermarkMs) { + InFlightTaskCounts counts = new InFlightTaskCounts(watermarkMs); + _context.forRunningTasks(offlineTableName, taskType, cfg -> { + String mode = cfg.get(MaterializedViewTask.TASK_MODE_KEY); + Preconditions.checkState(mode != null && !mode.isEmpty(), + "In-flight task missing %s for table: %s — buildTaskConfig must always set the mode", + MaterializedViewTask.TASK_MODE_KEY, offlineTableName); + if (MaterializedViewTask.TASK_MODE_APPEND.equals(mode)) { + String windowEndStr = cfg.get(MaterializedViewTask.WINDOW_END_MS_KEY); + Preconditions.checkState(windowEndStr != null, + "In-flight APPEND task missing %s for table: %s — buildTaskConfig must always set it", + MaterializedViewTask.WINDOW_END_MS_KEY, offlineTableName); + long end = Long.parseLong(windowEndStr); + counts._maxInFlightAppendWindowEndMs.accumulateAndGet(end, Math::max); + counts._inFlightAppendCount.incrementAndGet(); + } else if (MaterializedViewTask.TASK_MODE_DELETE.equals(mode)) { + counts._inFlightDeleteCount.incrementAndGet(); + } else if (MaterializedViewTask.TASK_MODE_OVERWRITE.equals(mode)) { + counts._inFlightOverwriteCount.incrementAndGet(); + } else { + // Forward-compat: unknown mode (e.g. a future REBUILD/BACKFILL added by a newer + // controller). Don't classify it as APPEND — that would silently miscount. Warn so + // operators see the version mismatch. + LOGGER.warn("Unknown task mode '{}' for in-flight task in table: {}; ignoring for " + + "scheduling decisions", mode, offlineTableName); + } + }); + return counts; + } + + /// Reads the scheduling watermark from MaterializedViewRuntimeMetadata or initialises it + /// on cold-start by finding the minimum segment start time from the source table + /// and aligning it to the bucket boundary. 
+ /// + /// On cold-start, `watermarkMs` is 0 and the partitions map is empty, so the broker + /// will not attempt split queries against the empty MV table. The first successful APPEND + /// (via the executor) will advance `watermarkMs` and add the partition entry. + @VisibleForTesting + long getWatermarkMs(String viewTableName, String sourceTableName, long bucketMs, String definedSQL, + Map taskConfigs) { + String viewTableWithType = TableNameBuilder.OFFLINE.tableNameWithType(viewTableName); + HelixPropertyStore propertyStore = _context.getPropertyStore(); + MaterializedViewRuntimeMetadata runtime = + MaterializedViewRuntimeMetadataUtils.fetch(propertyStore, viewTableWithType); + + if (runtime != null) { + return runtime.getWatermarkMs(); + } + + // Cold-start: find the earliest segment start time from the source table + String sourceTableWithType = resolveSourceTableNameWithType(sourceTableName); + List segmentsZKMetadata = _context.getSegmentsZKMetadata(sourceTableWithType); + + long minStartTimeMs = Long.MAX_VALUE; + for (SegmentZKMetadata segmentZKMetadata : segmentsZKMetadata) { + long startTimeMs = segmentZKMetadata.getStartTimeMs(); + if (startTimeMs >= 0) { + minStartTimeMs = Math.min(minStartTimeMs, startTimeMs); + } + } + Preconditions.checkState(minStartTimeMs != Long.MAX_VALUE, + "No valid segments found in source table: %s for cold-start watermark", sourceTableName); + + long watermarkMs = Math.floorDiv(minStartTimeMs, bucketMs) * bucketMs; + + // Empty partitions map on cold-start: the broker treats every bucket as "not covered" + // until the first APPEND populates the map. Freshness is now derived on read from + // (now - watermarkMs) against the per-table staleness SLO. + MaterializedViewRuntimeMetadata newRuntime = new MaterializedViewRuntimeMetadata( + viewTableWithType, watermarkMs, new HashMap<>()); + // Create-if-absent: two scheduler runs racing on cold-start would otherwise blind-clobber + // each other. 
If a concurrent writer already created the znode, fall back to fetching + // their value rather than overwriting — their persisted state may already include + // updates from the consistency manager / executor. + if (MaterializedViewRuntimeMetadataUtils.createIfAbsent(propertyStore, newRuntime)) { + LOGGER.info("Cold-start: initialized MaterializedViewRuntimeMetadata with watermark {} for MV table: {}", + watermarkMs, viewTableWithType); + } else { + MaterializedViewRuntimeMetadata existing = + MaterializedViewRuntimeMetadataUtils.fetch(propertyStore, viewTableWithType); + if (existing != null) { + LOGGER.info("Cold-start: another writer already initialized MV runtime metadata for {} " + + "(watermark={}); using existing values.", viewTableWithType, existing.getWatermarkMs()); + watermarkMs = existing.getWatermarkMs(); + } + } + + // Initialize MaterializedViewDefinitionMetadata with base table info and partition expression maps + Schema viewSchema = _context.getTableSchema(viewTableWithType); + Map partitionExprMaps = (viewSchema != null) + ? MaterializedViewAnalyzer.extractPartitionExprMaps(definedSQL, viewSchema) + : new HashMap<>(); + + // Resolve split spec from both the source and MV tables' time columns. The MV column + // is enforced TIMESTAMP by MaterializedViewAnalyzer (no format needed there); the base + // column may use any format, so we persist its `DateTimeFieldSpec.getFormat()` for the + // broker's base-side filter conversion. 
+ DateTimeFieldSpec sourceTimeFieldSpec = resolveSourceTimeFieldSpec(sourceTableName); + String sourceTimeColumn = sourceTimeFieldSpec.getName(); + String sourceTimeFormat = sourceTimeFieldSpec.getFormat(); + String viewTimeColumn = resolveMaterializedViewTimeColumn(viewTableWithType); + MaterializedViewSplitSpec splitSpec = + new MaterializedViewSplitSpec(sourceTimeColumn, sourceTimeFormat, viewTimeColumn, bucketMs); + + long stalenessThresholdMs = parseLong( + taskConfigs.get(MaterializedViewTask.STALENESS_THRESHOLD_MS_KEY), + MaterializedViewTask.DEFAULT_STALENESS_THRESHOLD_MS); + MaterializedViewDefinitionMetadata definition = new MaterializedViewDefinitionMetadata( + viewTableWithType, + Collections.singletonList(sourceTableName), + definedSQL, + partitionExprMaps, + splitSpec, + stalenessThresholdMs, + /*rewriteEnabled=*/ true); + // Create-if-absent so a racing scheduler run cannot clobber definition metadata that may + // have diverged due to a concurrent schema update (different `partitionExprMaps` / split + // spec). If the znode already exists, the controller-side definition writer (or another + // scheduler) already wrote it; we leave that authoritative copy in place. + if (MaterializedViewDefinitionMetadataUtils.createIfAbsent(propertyStore, definition)) { + LOGGER.info("Cold-start: initialized MaterializedViewDefinitionMetadata for MV table: {} with source table: {}", + viewTableWithType, sourceTableName); + } else { + LOGGER.info("Cold-start: MaterializedViewDefinitionMetadata for MV table: {} already exists; " + + "leaving authoritative copy in place.", viewTableWithType); + } + + return watermarkMs; + } + + /// Resolves the source table name with type suffix. Tries OFFLINE first, then REALTIME. 
+ private String resolveSourceTableNameWithType(String rawSourceTableName) { + String sourceTableWithType = TableNameBuilder.OFFLINE.tableNameWithType(rawSourceTableName); + TableConfig sourceTableConfig = _context.getTableConfig(sourceTableWithType); + if (sourceTableConfig != null) { + return sourceTableWithType; + } + sourceTableWithType = TableNameBuilder.REALTIME.tableNameWithType(rawSourceTableName); + sourceTableConfig = _context.getTableConfig(sourceTableWithType); + Preconditions.checkState(sourceTableConfig != null, + "Source table config not found for: %s", rawSourceTableName); + return sourceTableWithType; + } + + /// Computes a [PartitionFingerprint] by fetching segments from ZK. + private PartitionFingerprint computeWindowFingerprint(String sourceTableWithType, + long windowStartMs, long windowEndMs) { + return computeWindowFingerprint(_context.getSegmentsZKMetadata(sourceTableWithType), + windowStartMs, windowEndMs); + } + + /// Computes a [PartitionFingerprint] for the given time window from pre-fetched + /// segment metadata. + /// + /// The fingerprint is `Hashing.farmHashFingerprint64` over the sorted concatenation of + /// `\0\n` lines. Sorting makes the hash insensitive to listing order; + /// FarmHash64 is non-cryptographic but collision-resistant for non-adversarial inputs. + /// Replaces a previous XOR-CRC scheme that exhibited cancellation collisions (swap two + /// segments with the same combined contribution → identical fingerprint). 
+ private PartitionFingerprint computeWindowFingerprint(List allSegments, + long windowStartMs, long windowEndMs) { + List overlapping = new ArrayList<>(); + for (SegmentZKMetadata seg : allSegments) { + long segStartMs = seg.getStartTimeMs(); + long segEndMs = seg.getEndTimeMs(); + if (segStartMs < windowEndMs && segEndMs >= windowStartMs) { + overlapping.add(seg); + } + } + overlapping.sort(Comparator.comparing(SegmentZKMetadata::getSegmentName)); + + Hasher hasher = Hashing.farmHashFingerprint64().newHasher(); + for (SegmentZKMetadata seg : overlapping) { + hasher.putString(seg.getSegmentName(), StandardCharsets.UTF_8); + hasher.putByte((byte) 0); + hasher.putLong(seg.getCrc()); + hasher.putByte((byte) '\n'); + } + long crcFingerprint = hasher.hash().asLong(); + LOGGER.info("Computed partition fingerprint for window [{}, {}): segmentCount={}, crcFingerprint={}", + windowStartMs, windowEndMs, overlapping.size(), crcFingerprint); + return new PartitionFingerprint(overlapping.size(), crcFingerprint); + } +} diff --git a/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskUtils.java b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskUtils.java new file mode 100644 index 000000000000..dd8f4f2ed564 --- /dev/null +++ b/pinot-materialized-view/src/main/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskUtils.java @@ -0,0 +1,175 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.scheduler; + +import com.google.common.base.Preconditions; +import java.util.Map; +import java.util.function.Function; +import javax.annotation.Nullable; +import org.apache.pinot.materializedview.metadata.PartitionInfo; +import org.apache.pinot.materializedview.metadata.PartitionState; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/// Shared materialized-view task helpers used by the scheduler and minion executor wiring. +public final class MaterializedViewTaskUtils { + private static final Logger LOGGER = LoggerFactory.getLogger(MaterializedViewTaskUtils.class); + + private MaterializedViewTaskUtils() { + } + + /// Reads a positive-integer cluster-config override and falls back to {@code defaultValue} + /// when the key is unset, malformed, or non-positive. Use for caps that must be reloadable + /// at runtime without a controller / minion restart — callers MUST invoke this on every + /// consumer-site call rather than caching the result. 
+  public static int readPositiveIntClusterConfigOrDefault(
+      @Nullable Function<String, String> clusterConfigLookup, String configKey, int defaultValue) {
+    // No lookup wired: there is nothing to read, so use the default.
+    if (clusterConfigLookup == null) {
+      return defaultValue;
+    }
+    String raw = clusterConfigLookup.apply(configKey);
+    // An absent or empty value falls back to the default without logging.
+    if (raw == null || raw.isEmpty()) {
+      return defaultValue;
+    }
+    try {
+      // Surrounding whitespace is tolerated; the warnings below log the untrimmed value.
+      int parsed = Integer.parseInt(raw.trim());
+      if (parsed <= 0) {
+        LOGGER.warn("Cluster config '{}'='{}' is non-positive; falling back to default {}",
+            configKey, raw, defaultValue);
+        return defaultValue;
+      }
+      return parsed;
+    } catch (NumberFormatException e) {
+      LOGGER.warn("Cluster config '{}'='{}' is not a valid integer; falling back to default {}",
+          configKey, raw, defaultValue);
+      return defaultValue;
+    }
+  }
+
+  /// Same as {@link #readPositiveIntClusterConfigOrDefault} for `long` values (e.g. millis).
+  ///
+  /// Returns `defaultValue` when the lookup is null, when the value is null or empty, when it
+  /// does not parse as a `long`, or when it parses to zero or a negative number. The last two
+  /// cases log a warning; the first two are silent.
+  public static long readPositiveLongClusterConfigOrDefault(
+      @Nullable Function<String, String> clusterConfigLookup, String configKey, long defaultValue) {
+    if (clusterConfigLookup == null) {
+      return defaultValue;
+    }
+    String raw = clusterConfigLookup.apply(configKey);
+    if (raw == null || raw.isEmpty()) {
+      return defaultValue;
+    }
+    try {
+      // Surrounding whitespace is tolerated; the warnings below log the untrimmed value.
+      long parsed = Long.parseLong(raw.trim());
+      if (parsed <= 0L) {
+        LOGGER.warn("Cluster config '{}'='{}' is non-positive; falling back to default {}",
+            configKey, raw, defaultValue);
+        return defaultValue;
+      }
+      return parsed;
+    } catch (NumberFormatException e) {
+      LOGGER.warn("Cluster config '{}'='{}' is not a valid long; falling back to default {}",
+          configKey, raw, defaultValue);
+      return defaultValue;
+    }
+  }
+
+  /// Reads a cluster-config value via the supplied lookup; returns null if the lookup is null
+  /// or returns null. Tolerates a null lookup for unit tests that don't wire a context.
+  /// The value is returned exactly as the lookup produced it (no trimming or validation).
+  @Nullable
+  public static String readClusterConfig(
+      @Nullable Function<String, String> clusterConfigLookup, String configKey) {
+    return clusterConfigLookup == null ? null : clusterConfigLookup.apply(configKey);
+  }
+
+  /// Returns the highest contiguous VALID upper boundary starting from `fromMs`.
+  ///
+  /// Walks the map one bucket at a time (`fromMs`, `fromMs + bucketMs`, ...) for as long as the
+  /// entry keyed by the cursor exists and is in state `VALID`, and returns the first cursor
+  /// that is missing or not `VALID`. If the entry at `fromMs` itself fails that check, `fromMs`
+  /// is returned unchanged. Throws `IllegalArgumentException` when `bucketMs` is not positive.
+  public static long computeContiguousUpperMs(long fromMs, Map<Long, PartitionInfo> partitions,
+      long bucketMs) {
+    Preconditions.checkArgument(bucketMs > 0, "bucketMs must be positive, got: %s", bucketMs);
+    long cursor = fromMs;
+    // A contiguous run can never be longer than the number of entries, so the map size is a
+    // hard bound on the walk.
+    int maxIterations = partitions.size();
+    for (int i = 0; i < maxIterations; i++) {
+      PartitionInfo info = partitions.get(cursor);
+      if (info == null || info.getState() != PartitionState.VALID) {
+        return cursor;
+      }
+      cursor += bucketMs;
+    }
+    return cursor;
+  }
+
+  /// Parses and validates the `EFFECTIVE_LIMIT_KEY` task-config value. Throws with an actionable
+  /// message when missing, malformed, or non-positive — the executor's saturation gate cannot
+  /// run without a valid effective limit, so failing loud beats silent truncation.
+  ///
+  /// Every failure mode surfaces as an `IllegalStateException`; a malformed value keeps the
+  /// original `NumberFormatException` as the cause. `tableName` is used only in messages.
+  public static int parseEffectiveLimit(Map<String, String> configs, String tableName) {
+    String limitStr = configs.get(MaterializedViewTask.EFFECTIVE_LIMIT_KEY);
+    if (limitStr == null || limitStr.isEmpty()) {
+      // The log line carries the operator hint (upgrade the controller); the exception message
+      // only names the missing key and the table.
+      LOGGER.error("Missing {} in task config for table: {}. "
+          + "Saturation gate cannot be silently skipped - upgrade the controller and retry.",
+          MaterializedViewTask.EFFECTIVE_LIMIT_KEY, tableName);
+      throw new IllegalStateException("Missing " + MaterializedViewTask.EFFECTIVE_LIMIT_KEY
+          + " in task config for table: " + tableName);
+    }
+    int effectiveLimit;
+    try {
+      effectiveLimit = Integer.parseInt(limitStr);
+    } catch (NumberFormatException e) {
+      throw new IllegalStateException(
+          "Invalid " + MaterializedViewTask.EFFECTIVE_LIMIT_KEY + " '" + limitStr
+              + "' in task config for table: " + tableName, e);
+    }
+    if (effectiveLimit <= 0) {
+      LOGGER.error("Non-positive effectiveLimit {} in task config for table: {}",
+          effectiveLimit, tableName);
+      throw new IllegalStateException(
+          "effectiveLimit must be positive for table: " + tableName + ", got: " + effectiveLimit);
+    }
+    return effectiveLimit;
+  }
+
+  /// Throws the saturation-gate failure with an operator-actionable message.
+  ///
+  /// Never returns normally: the message is logged at ERROR level and then thrown as an
+  /// `IllegalStateException`. The window is reported as the half-open range
+  /// `[windowStartMs, windowEndMs)`.
+  public static void failOnSaturation(String tableName, long windowStartMs, long windowEndMs,
+      long actualRows, int effectiveLimit) {
+    String message = String.format(
+        "MV result saturated LIMIT: table=%s, window=[%d, %d), rows=%d, LIMIT=%d. "
+            + "The materialized window is likely incomplete; failing the task to prevent "
+            + "marking this partition VALID with truncated data. Narrow the time bucket / "
+            + "filters in definedSQL, or add/raise the LIMIT clause in definedSQL.",
+        tableName, windowStartMs, windowEndMs, actualRows, effectiveLimit);
+    LOGGER.error(message);
+    throw new IllegalStateException(message);
+  }
+
+  /// Fails the task if the query result set saturated the declared `LIMIT`, since that
+  /// strongly suggests the window was truncated and the resulting MV would be incomplete.
+  /// Delegates to [#parseEffectiveLimit] + [#failOnSaturation] so the production streaming
+  /// path and the @VisibleForTesting helper share one implementation.
+  public static void verifyResultNotTruncated(Map<String, String> configs, String tableName,
+      long windowStartMs, long windowEndMs, int actualRows) {
+    int effectiveLimit = parseEffectiveLimit(configs, tableName);
+    // Reaching the limit exactly already counts as saturated: a result of exactly
+    // `effectiveLimit` rows cannot be told apart from one that was cut off at the limit.
+    if (actualRows >= effectiveLimit) {
+      failOnSaturation(tableName, windowStartMs, windowEndMs, actualRows, effectiveLimit);
+    }
+  }
+
+  /// Builds a segment name that is stable within a single attempt but unique across retries of the
+  /// same window.
+  ///
+  /// The name is the underscore-joined sequence `tableName`, `windowStartMs`, `windowEndMs`,
+  /// `attemptId`, `segIdx`; `attemptId` is the only component that can differ between two
+  /// attempts at the same window and segment index.
+  public static String buildSegmentName(String tableName, long windowStartMs, long windowEndMs,
+      String attemptId, int segIdx) {
+    return tableName + "_" + windowStartMs + "_" + windowEndMs + "_" + attemptId + "_" + segIdx;
+  }
+}
diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzerTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzerTest.java
new file mode 100644
index 000000000000..dd76834650b6
--- /dev/null
+++ b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/MaterializedViewAnalyzerTest.java
@@ -0,0 +1,1108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.pinot.materializedview.analysis; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import javax.annotation.Nullable; +import org.apache.pinot.materializedview.context.MaterializedViewTaskGeneratorContext; +import org.apache.pinot.spi.config.table.DedupConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.config.table.UpsertConfig; +import org.apache.pinot.spi.config.table.ingestion.BatchIngestionConfig; +import org.apache.pinot.spi.config.table.ingestion.IngestionConfig; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + + +public class MaterializedViewAnalyzerTest { + + private static final String SOURCE_TABLE = "orders"; + private static final String SOURCE_TABLE_OFFLINE = "orders_OFFLINE"; + private static final String TIME_COLUMN = "DaysSinceEpoch"; + /// Appended to every test SQL that is expected to reach validations beyond the LIMIT check. 
+ private static final String DEFAULT_LIMIT = " LIMIT 1000"; + + private MaterializedViewTaskGeneratorContext _mockAccessor; + private TableConfig _sourceTableConfig; + private Schema _sourceSchema; + + @BeforeMethod + public void setUp() { + _mockAccessor = mock(MaterializedViewTaskGeneratorContext.class); + + _sourceTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(SOURCE_TABLE_OFFLINE) + .setTimeColumnName(TIME_COLUMN) + .build(); + + _sourceSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addSingleValueDimension("status", FieldSpec.DataType.STRING) + .addMetric("amount", FieldSpec.DataType.DOUBLE) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + stubTable(SOURCE_TABLE_OFFLINE, _sourceTableConfig, _sourceSchema); + } + + /// Stubs `tableExists`, `getTableConfig`, and `getTableSchema` in lock-step so callers don't + /// have to remember the three calls. Pass `null` for the config or schema when the test + /// specifically wants the absence case (e.g. probing a non-existent variant). + private void stubTable(String tableNameWithType, @Nullable TableConfig config, @Nullable Schema schema) { + boolean exists = config != null; + when(_mockAccessor.tableExists(tableNameWithType)).thenReturn(exists); + if (exists) { + when(_mockAccessor.getTableConfig(tableNameWithType)).thenReturn(config); + } else { + when(_mockAccessor.getTableConfig(tableNameWithType)) + .thenThrow(new IllegalStateException("Table config not found for: " + tableNameWithType)); + } + if (schema != null) { + when(_mockAccessor.getTableSchema(tableNameWithType)).thenReturn(schema); + } else if (exists) { + // Table exists but no schema: matches the "cluster-state inconsistency" branch. 
+ when(_mockAccessor.getTableSchema(tableNameWithType)) + .thenThrow(new IllegalStateException("Schema not found for table: " + tableNameWithType)); + } + } + + // ----------------------------------------------------------------------- + // Happy path + // ----------------------------------------------------------------------- + + @Test + public void testValidSqlWithMatchingSchema() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt, sum(amount) AS total_amount " + + "FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addMetric("total_amount", FieldSpec.DataType.DOUBLE) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + + assertNotNull(result); + assertEquals(result.getSourceTableName(), SOURCE_TABLE); + assertTrue(result.getSelectFields().contains("city")); + assertTrue(result.getSelectFields().contains("cnt")); + assertTrue(result.getSelectFields().contains("total_amount")); + assertTrue(result.getSelectFields().contains(TIME_COLUMN)); + assertEquals(result.getSelectFields().size(), 4); + + // Verify partitionExprMaps + assertNotNull(result.getPartitionExprMaps()); + assertEquals(result.getPartitionExprMaps().size(), 1); + assertEquals(result.getPartitionExprMaps().get(TIME_COLUMN), TIME_COLUMN); + } + + @Test + public void testValidSqlBareColumnsOnly() { + String sql = "SELECT DaysSinceEpoch, city, status FROM orders"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addSingleValueDimension("status", 
FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + + assertNotNull(result); + assertEquals(result.getSelectFields().size(), 3); + assertEquals(result.getPartitionExprMaps().get(TIME_COLUMN), TIME_COLUMN); + } + + @Test + public void testValidSqlWithTimeTransformFunction() { + String sql = "SELECT DATETRUNC('DAY', DaysSinceEpoch) AS dayBucket, city, count(*) AS cnt " + + "FROM orders GROUP BY DATETRUNC('DAY', DaysSinceEpoch), city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime("dayBucket", FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + // The MV renames the time column via DATETRUNC, so segmentsConfig.timeColumnName must + // point to the SELECT alias 'dayBucket' — not the inherited base name. 
+ TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("dayBucket") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + + assertNotNull(result); + assertEquals(result.getPartitionExprMaps().size(), 1); + assertEquals(result.getPartitionExprMaps().get("datetrunc('DAY', DaysSinceEpoch)"), "dayBucket"); + } + + @Test + public void testTimeColumnMissingFromSelect() { + String sql = "SELECT city, count(*) AS cnt FROM orders GROUP BY city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "is not produced by any SELECT expression"); + } + + @Test + public void testTimeColumnMissingFromGroupBy() { + // Calcite enforces that non-aggregated SELECT columns must appear in GROUP BY, + // so this SQL fails at syntax validation with Calcite's own error message. 
+ String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "functionally dependent"); + } + + // ----------------------------------------------------------------------- + // Step 1: SQL syntax errors + // ----------------------------------------------------------------------- + + @Test + public void testInvalidSqlSyntax() { + String sql = "SELCT city FROM orders"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .build(); + + expectError(sql, viewSchema, "Invalid SQL syntax"); + } + + @Test + public void testNullSql() { + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .build(); + + expectError(null, viewSchema, "definedSQL must be specified"); + } + + @Test + public void testEmptySql() { + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .build(); + + expectError("", viewSchema, "definedSQL must be specified"); + } + + // ----------------------------------------------------------------------- + // Step 1b: LIMIT is optional; when present it must be strictly positive + // ----------------------------------------------------------------------- + + @Test + public void testMissingLimitAllowed() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") 
+ .build(); + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + + // No LIMIT is now allowed — truncation check is simply disabled. + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(sql, viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + assertNotNull(result); + } + + @Test + public void testExplicitLimitAllowed() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city " + + "LIMIT 10000"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(sql, viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + assertNotNull(result); + } + + @Test + public void testZeroLimitRejected() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city LIMIT 0"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + expectErrorRaw(sql, viewSchema, taskConfigs, "LIMIT must be strictly positive"); + } + + @Test + public void testLargeLimitAccepted() { + // MAX_MATERIALIZED_VIEW_QUERY_LIMIT = 100_000_000: large user-declared LIMITs up to the cap + // are honored. 
The saturation gate still fires when a window's actual row count reaches + // the declared LIMIT, so users opt into the truncation guarantee at their chosen LIMIT. + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city LIMIT 99000000"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(sql, viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + assertNotNull(result); + } + + @Test + public void testLimitAboveCapRejected() { + // Per MAJOR fix: LIMITs above MAX_MATERIALIZED_VIEW_QUERY_LIMIT (100M) must be rejected at + // create time so the executor cannot accumulate that many rows in memory before the + // saturation gate fires. 
+ String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city LIMIT 200000000"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + expectErrorRaw(sql, viewSchema, taskConfigs, "exceeds maximum 100000000"); + } + + @Test + public void testOffsetRejected() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city LIMIT 100 OFFSET 50"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + expectErrorRaw(sql, viewSchema, taskConfigs, "must not declare OFFSET"); + } + + @Test + public void testTrailingLineCommentBlocksAutoLimitInjection() { + // No LIMIT in definedSQL + trailing line comment would swallow the auto-injected LIMIT + // at task-generation time. Analyzer simulates the append at create time and rejects. 
+ String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city -- daily aggregation"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + expectErrorRaw(sql, viewSchema, taskConfigs, "swallow the auto-injected LIMIT"); + } + + @Test + public void testTrailingUnterminatedBlockCommentRejected() { + // An unterminated block comment fails Calcite's initial SQL syntax check (Step 1) — + // the analyzer surfaces this at create time before the auto-inject probe even runs. + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city /* trailing"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + expectErrorRaw(sql, viewSchema, taskConfigs, "Invalid SQL syntax"); + } + + @Test + public void testTrailingTerminatedBlockCommentAllowed() { + // A terminated block comment is stripped by Calcite before parsing — the auto-inject + // probe sees clean SQL ending in the GROUP BY clause and successfully verifies the LIMIT. 
+ String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city /* trailing comment */"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(sql, viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + assertNotNull(result); + } + + @Test + public void testNegativeBufferRejected() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders " + + "GROUP BY DaysSinceEpoch, city LIMIT 100"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + taskConfigs.put(MaterializedViewTask.BUFFER_TIME_PERIOD_KEY, "-1d"); + expectErrorRaw(sql, viewSchema, taskConfigs, "bufferTimePeriod"); + } + + @Test + public void testTryExtractDeclaredLimitPresent() { + String sql = "SELECT DaysSinceEpoch, city FROM orders LIMIT 2500"; + assertEquals(MaterializedViewAnalyzer.tryExtractDeclaredLimit(sql), Optional.of(2500)); + } + + @Test + public void testTryExtractDeclaredLimitAbsent() { + String sql = "SELECT DaysSinceEpoch, city FROM orders"; + assertEquals(MaterializedViewAnalyzer.tryExtractDeclaredLimit(sql), Optional.empty()); + } + + // ----------------------------------------------------------------------- + // Step 2: Source table validation + // ----------------------------------------------------------------------- + + @Test + public void 
testSourceTableNotFound() { + String sql = "SELECT DaysSinceEpoch, city FROM nonexistent_table GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + stubTable("nonexistent_table_OFFLINE", null, null); + stubTable("nonexistent_table_REALTIME", null, null); + + expectError(sql, viewSchema, "does not exist"); + } + + @Test + public void testSourceTableNoTimeColumn() { + TableConfig noTimeConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("no_time_table_OFFLINE") + .build(); + stubTable("no_time_table_OFFLINE", noTimeConfig, null); + + String sql = "SELECT DaysSinceEpoch, city FROM no_time_table GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "has no time column configured"); + } + + @Test + public void testSourceTableNoDateTimeFieldSpec() { + TableConfig withTimeConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("missing_spec_table_OFFLINE") + .setTimeColumnName("missingCol") + .build(); + Schema schemaWithoutSpec = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .build(); + stubTable("missing_spec_table_OFFLINE", withTimeConfig, schemaWithoutSpec); + + String sql = "SELECT DaysSinceEpoch, city FROM missing_spec_table GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "No DateTimeFieldSpec found"); + } + + // 
----------------------------------------------------------------------- + // Source-table type eligibility (Step 2): MV's coverage model assumes the base table is + // append-only with monotonically advancing time. Tables whose contents can be replaced or + // rewritten silently — upsert, dedup, dimension, REFRESH-push — must be rejected at create + // time so a known-broken MV cannot land in cluster metadata. + // ----------------------------------------------------------------------- + + @Test + public void testRejectsUpsertSourceTable() { + // Upsert: in-place row replacement breaks the assumption that a VALID time partition is + // immutable; a late update to a covered interval would silently diverge from the MV. + String mutableTable = "orders_upsert"; + TableConfig upsertCfg = new TableConfigBuilder(TableType.REALTIME) + .setTableName(mutableTable) + .setTimeColumnName(TIME_COLUMN) + .setUpsertConfig(new UpsertConfig(UpsertConfig.Mode.FULL)) + .build(); + Schema schema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + stubTable(mutableTable + "_OFFLINE", null, null); + stubTable(mutableTable + "_REALTIME", upsertCfg, schema); + + String sql = "SELECT DaysSinceEpoch, city FROM " + mutableTable + " GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + expectError(sql, viewSchema, "upsert enabled"); + } + + @Test + public void testRejectsDedupSourceTable() { + // Dedup: the de-duplicated view is server-managed and not stable across reloads/TTLs; + // MV would aggregate over a snapshot that the runtime can later disagree with. 
+ String dedupTable = "orders_dedup"; + TableConfig dedupCfg = new TableConfigBuilder(TableType.REALTIME) + .setTableName(dedupTable) + .setTimeColumnName(TIME_COLUMN) + .setDedupConfig(new DedupConfig(true, null)) + .build(); + Schema schema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + stubTable(dedupTable + "_OFFLINE", null, null); + stubTable(dedupTable + "_REALTIME", dedupCfg, schema); + + String sql = "SELECT DaysSinceEpoch, city FROM " + dedupTable + " GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + expectError(sql, viewSchema, "dedup enabled"); + } + + @Test + public void testRejectsDimensionSourceTable() { + // Dimension table: fully replaced on every refresh and has no monotonic time concept; + // the MV's time-partitioned coverage model is meaningless here. 
+ String dimTable = "dim_lookup"; + TableConfig dimCfg = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(dimTable) + .setTimeColumnName(TIME_COLUMN) + .setIsDimTable(true) + .build(); + Schema schema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + stubTable(dimTable + "_OFFLINE", dimCfg, schema); + + String sql = "SELECT DaysSinceEpoch, city FROM " + dimTable + " GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + expectError(sql, viewSchema, "dimension table"); + } + + @Test + public void testRejectsRefreshPushTable() { + // REFRESH push: each push wholesale replaces base segments, so any MV partition already + // marked VALID becomes immediately suspect after the next push. 
+ String refreshTable = "orders_refresh"; + IngestionConfig ingestionCfg = new IngestionConfig(); + ingestionCfg.setBatchIngestionConfig(new BatchIngestionConfig(null, "REFRESH", "DAILY")); + TableConfig refreshCfg = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(refreshTable) + .setTimeColumnName(TIME_COLUMN) + .setIngestionConfig(ingestionCfg) + .build(); + Schema schema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + stubTable(refreshTable + "_OFFLINE", refreshCfg, schema); + + String sql = "SELECT DaysSinceEpoch, city FROM " + refreshTable + " GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + expectError(sql, viewSchema, "REFRESH push type"); + } + + @Test + public void testRejectsRefreshPushTableViaLegacyField() { + // Legacy: REFRESH was set via the deprecated SegmentsValidationAndRetentionConfig field. + // resolveSegmentPushType must fall through to it so older configs cannot bypass the guard. 
+ String refreshTable = "orders_refresh_legacy"; + TableConfig refreshCfg = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(refreshTable) + .setTimeColumnName(TIME_COLUMN) + .setSegmentPushType("REFRESH") + .build(); + Schema schema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + stubTable(refreshTable + "_OFFLINE", refreshCfg, schema); + + String sql = "SELECT DaysSinceEpoch, city FROM " + refreshTable + " GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + expectError(sql, viewSchema, "REFRESH push type"); + } + + @Test + public void testSourceColumnNotExist() { + String sql = "SELECT DaysSinceEpoch, city, sum(nonexistent_col) AS total FROM orders " + + "GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("total", FieldSpec.DataType.DOUBLE) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "does not exist in source table"); + } + + // ----------------------------------------------------------------------- + // Step 3: MV schema column validation + // ----------------------------------------------------------------------- + + @Test + public void testMaterializedViewSchemaColumnNotCoveredBySelect() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addMetric("extra_column", 
FieldSpec.DataType.DOUBLE) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "is not produced by any SELECT expression"); + } + + @Test + public void testSelectFieldNotInMaterializedViewSchema() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt, sum(amount) AS total " + + "FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "does not match any column in the MV table schema"); + } + + @Test + public void testAggregateWithoutAlias() { + String sql = "SELECT DaysSinceEpoch, city, count(*) FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "must have an AS alias"); + } + + // ----------------------------------------------------------------------- + // Step 4: Task config parameter validation + // ----------------------------------------------------------------------- + + @Test + public void testNonOfflineTableType() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig realtimeConfig = new TableConfigBuilder(TableType.REALTIME) + 
.setTableName("mv_orders") + .setTimeColumnName(TIME_COLUMN) + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + try { + MaterializedViewAnalyzer.analyze(withLimit(sql), realtimeConfig, viewSchema, taskConfigs, _mockAccessor); + fail("Expected IllegalStateException for non-OFFLINE table"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("only supports OFFLINE"), "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testInvalidBucketTimePeriod() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + Map taskConfigs = buildTaskConfigs(sql); + taskConfigs.put(MaterializedViewTask.BUCKET_TIME_PERIOD_KEY, "not_a_period"); + + expectError(sql, viewSchema, taskConfigs, "Invalid bucketTimePeriod"); + } + + @Test + public void testInvalidMaxNumRecordsPerSegment() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + Map taskConfigs = buildTaskConfigs(sql); + taskConfigs.put(MaterializedViewTask.MAX_NUM_RECORDS_PER_SEGMENT_KEY, "-5"); + + expectError(sql, viewSchema, taskConfigs, "maxNumRecordsPerSegment must be positive"); + } + + @Test + public void testNonNumericMaxNumRecordsPerSegment() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", 
FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + Map taskConfigs = buildTaskConfigs(sql); + taskConfigs.put(MaterializedViewTask.MAX_NUM_RECORDS_PER_SEGMENT_KEY, "abc"); + + expectError(sql, viewSchema, taskConfigs, "Invalid maxNumRecordsPerSegment"); + } + + // ----------------------------------------------------------------------- + // Complex SQL + // ----------------------------------------------------------------------- + + @Test + public void testComplexSqlWithMultipleAggregations() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt, sum(amount) AS total, " + + "min(amount) AS min_amt, max(amount) AS max_amt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addMetric("total", FieldSpec.DataType.DOUBLE) + .addMetric("min_amt", FieldSpec.DataType.DOUBLE) + .addMetric("max_amt", FieldSpec.DataType.DOUBLE) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + + assertNotNull(result); + assertEquals(result.getSelectFields().size(), 6); + } + + @Test + public void testRejectsRealtimeSourceTable() { + // Realtime source tables are rejected until the controller-side notify path supports realtime + // segment commits (LLC). The fallback OFFLINE-then-REALTIME lookup in resolveSourceTableWithType + // will resolve a realtime-only base; the analyzer must catch that here so a misconfigured MV + // never reaches cluster metadata. 
+ String realtimeTable = "rt_orders_REALTIME"; + TableConfig rtConfig = new TableConfigBuilder(TableType.REALTIME) + .setTableName(realtimeTable) + .setTimeColumnName(TIME_COLUMN) + .build(); + Schema rtSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + stubTable("rt_orders_OFFLINE", null, null); + stubTable(realtimeTable, rtConfig, rtSchema); + + String sql = "SELECT DaysSinceEpoch, city FROM rt_orders"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = buildMaterializedViewTableConfig(); + Map taskConfigs = buildTaskConfigs(sql); + + try { + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + fail("Expected IllegalStateException for REALTIME source table"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("REALTIME"), + "Unexpected message: " + e.getMessage()); + } + } + + // ----------------------------------------------------------------------- + // Step 6: MV time-column alignment (segmentsConfig.timeColumnName) + // ----------------------------------------------------------------------- + + @Test + public void testRejectsWhenMaterializedViewTimeColumnMissing() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) 
+ .setTableName("mv_orders") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + try { + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + fail("Expected IllegalStateException for unset MV timeColumnName"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("segmentsConfig.timeColumnName must be set"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testRejectsWhenMaterializedViewTimeColumnNotInSchema() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + // timeColumnName points to a column that doesn't exist in the MV schema at all. + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("nonexistent_time_col") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + try { + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + fail("Expected IllegalStateException for MV timeColumnName missing from schema"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("MV time column 'nonexistent_time_col' does not exist in MV schema"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testRejectsWhenMaterializedViewTimeColumnIsNotDateTime() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, 
"1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + // timeColumnName points to a plain dimension, not a registered dateTime column. + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("city") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + try { + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + fail("Expected IllegalStateException for MV timeColumnName not being a dateTime column"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("is not a dateTime field in the MV schema"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testRejectsWhenMaterializedViewTimeColumnNotProducedBySelect() { + // Simulates the real-world misconfig: base table time column is DaysSinceEpoch; the + // definedSql transforms it via date_trunc into a coarser 'day' column; but the MV + // TableConfig inherited timeColumnName=DaysSinceEpoch from the base table without + // updating it. The MV will not physically contain DaysSinceEpoch. 
+ String sql = "SELECT date_trunc('DAY', DaysSinceEpoch) AS day, city, count(*) AS cnt " + + "FROM orders GROUP BY day, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime("day", FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName(TIME_COLUMN) + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + try { + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + fail("Expected IllegalStateException for MV timeColumnName not produced by SELECT"); + } catch (IllegalStateException e) { + // The column is absent from the MV schema entirely, so invariant (b) fires first + // with a message that still points the user to the root cause. + assertTrue(e.getMessage().contains("MV time column '" + TIME_COLUMN + "' does not exist in MV schema"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testAcceptsWhenMaterializedViewTimeColumnIsSelectAlias() { + // Happy path: MV renames the time column via DATETRUNC, segmentsConfig.timeColumnName + // points to the SELECT alias. DATETRUNC unit 'DAY' matches bucketTimePeriod '1d'. 
+ String sql = "SELECT DATETRUNC('DAY', DaysSinceEpoch) AS day, city, count(*) AS cnt " + + "FROM orders GROUP BY day, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime("day", FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("day") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + MaterializedViewAnalyzer.AnalysisResult result = + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + + assertNotNull(result); + assertEquals(result.getPartitionExprMaps().size(), 1); + assertTrue(result.getPartitionExprMaps().containsValue("day"), + "Expected partitionExprMaps to map some base-table expression -> 'day', got: " + + result.getPartitionExprMaps()); + } + + // ----------------------------------------------------------------------- + // Step 7: MV time column TIMESTAMP-only contract (TimeExprValidator) + // + // Per-rule behavior of the validator is covered in TimeExprValidatorTest; here we + // exercise only the end-to-end wiring through analyze(): unsupported function paths, + // nested function paths, and the data-type guard. Format/granularity inference is + // no longer a thing — both base and MV time columns must be TIMESTAMP. + // ----------------------------------------------------------------------- + + @Test + public void testStep7RejectsNonTimestampBaseColumn() { + // setUp() now declares the base column as TIMESTAMP, so override it back to LONG/EPOCH-days + // to confirm Step 7 rejects non-TIMESTAMP base columns. 
+ _sourceSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addSingleValueDimension("status", FieldSpec.DataType.STRING) + .addMetric("amount", FieldSpec.DataType.DOUBLE) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.LONG, "1:DAYS:EPOCH", "1:DAYS") + .build(); + stubTable(SOURCE_TABLE_OFFLINE, _sourceTableConfig, _sourceSchema); + + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt " + + "FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + expectError(sql, viewSchema, "TIMESTAMP"); + } + + @Test + public void testStep7RejectsNonTimestampMaterializedViewColumn() { + String sql = "SELECT DaysSinceEpoch, city, count(*) AS cnt " + + "FROM orders GROUP BY DaysSinceEpoch, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime(TIME_COLUMN, FieldSpec.DataType.LONG, "1:DAYS:EPOCH", "1:DAYS") + .build(); + + expectError(sql, viewSchema, "TIMESTAMP"); + } + + @Test + public void testStep7DatetruncHappyOnTimestampBase() { + String sql = "SELECT DATETRUNC('DAY', DaysSinceEpoch) AS day, city, count(*) AS cnt " + + "FROM orders GROUP BY day, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime("day", FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("day") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + MaterializedViewAnalyzer.AnalysisResult 
result = + MaterializedViewAnalyzer.analyze(withLimit(sql), viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + assertNotNull(result); + assertTrue(result.getPartitionExprMaps().containsValue("day")); + } + + @Test + public void testStep7DatetruncUnitMismatchesBucketRejected() { + String sql = "SELECT DATETRUNC('HOUR', DaysSinceEpoch) AS hr, city, count(*) AS cnt " + + "FROM orders GROUP BY hr, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime("hr", FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("hr") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + expectErrorRaw(withLimit(sql), viewSchema, viewTableConfig, taskConfigs, + "does not match the declared bucketTimePeriod"); + } + + @Test + public void testStep7UnsupportedFunctionRejected() { + String sql = "SELECT fromEpochDays(DaysSinceEpoch) AS ts_ms, city, count(*) AS cnt " + + "FROM orders GROUP BY ts_ms, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime("ts_ms", FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("ts_ms") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + expectErrorRaw(withLimit(sql), viewSchema, viewTableConfig, taskConfigs, "unsupported function"); + } + + @Test + public void testStep7NestedFunctionRejected() { + // Nested DATETRUNC inside another DATETRUNC: not an identity, not a top-level supported + // function — must be rejected because DATETRUNC's second argument is not the bare base time column.
+ String sql = "SELECT DATETRUNC('DAY', DATETRUNC('HOUR', DaysSinceEpoch)) AS day, city, count(*) AS cnt " + + "FROM orders GROUP BY day, city"; + Schema viewSchema = new Schema.SchemaBuilder() + .addSingleValueDimension("city", FieldSpec.DataType.STRING) + .addMetric("cnt", FieldSpec.DataType.LONG) + .addDateTime("day", FieldSpec.DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS") + .build(); + + TableConfig viewTableConfig = new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName("day") + .build(); + Map taskConfigs = buildTaskConfigs(sql); + + expectErrorRaw(withLimit(sql), viewSchema, viewTableConfig, taskConfigs, + "second argument must be the base time column"); + } + + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + private TableConfig buildMaterializedViewTableConfig() { + return new TableConfigBuilder(TableType.OFFLINE) + .setTableName("mv_orders") + .setTimeColumnName(TIME_COLUMN) + .build(); + } + + private Map buildTaskConfigs(String sql) { + Map taskConfigs = new HashMap<>(); + taskConfigs.put(MaterializedViewTask.DEFINED_SQL_KEY, sql); + taskConfigs.put(MaterializedViewTask.BUCKET_TIME_PERIOD_KEY, "1d"); + return taskConfigs; + } + + private void expectError(String sql, Schema viewSchema, String expectedMessageFragment) { + expectError(sql, viewSchema, buildTaskConfigs(sql), expectedMessageFragment); + } + + private void expectError(String sql, Schema viewSchema, Map taskConfigs, + String expectedMessageFragment) { + expectErrorRaw(withLimit(sql), viewSchema, taskConfigs, expectedMessageFragment); + } + + /// Same as [#expectError(String, Schema, Map, String)] but does not append a default + /// LIMIT. Used by tests that intentionally exercise the LIMIT-validation path.
+ private void expectErrorRaw(String sql, Schema viewSchema, Map taskConfigs, + String expectedMessageFragment) { + expectErrorRaw(sql, viewSchema, buildMaterializedViewTableConfig(), taskConfigs, expectedMessageFragment); + } + + /// Variant that lets the caller supply a custom MV [TableConfig] (e.g. with a + /// SELECT-alias time column name). Step-7 tests need this because the time column is + /// usually an alias of the base time column. + private void expectErrorRaw(String sql, Schema viewSchema, TableConfig viewTableConfig, + Map taskConfigs, String expectedMessageFragment) { + try { + MaterializedViewAnalyzer.analyze(sql, viewTableConfig, viewSchema, taskConfigs, _mockAccessor); + fail("Expected IllegalStateException containing: " + expectedMessageFragment); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains(expectedMessageFragment), + "Expected message containing '" + expectedMessageFragment + "', got: " + e.getMessage()); + } + } + + /// Returns `sql` as-is if it already ends with a LIMIT clause, otherwise appends one. + private static String withLimit(String sql) { + if (sql == null || sql.isEmpty()) { + return sql; + } + return sql.toUpperCase().contains(" LIMIT ") ? sql : sql + DEFAULT_LIMIT; + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidatorTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidatorTest.java new file mode 100644 index 000000000000..18f1dc1c9a6d --- /dev/null +++ b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/analysis/timeexpr/TimeExprValidatorTest.java @@ -0,0 +1,238 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.analysis.timeexpr; + +import java.util.concurrent.TimeUnit; +import org.apache.pinot.common.request.Expression; +import org.apache.pinot.spi.data.DateTimeFieldSpec; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.sql.parsers.CalciteSqlParser; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertThrows; +import static org.testng.Assert.expectThrows; + + +public class TimeExprValidatorTest { + + private static final DateTimeFieldSpec TIMESTAMP_FIELD = new DateTimeFieldSpec( + "ts", DataType.TIMESTAMP, "1:MILLISECONDS:TIMESTAMP", "1:MILLISECONDS"); + private static final DateTimeFieldSpec LONG_EPOCH_DAYS_FIELD = new DateTimeFieldSpec( + "ts", DataType.LONG, "1:DAYS:EPOCH", "1:DAYS"); + + private static Expression parseSelectExpr(String selectListItem) { + String sql = "SELECT " + selectListItem + " FROM t"; + return CalciteSqlParser.compileToPinotQuery(sql).getSelectList().get(0); + } + + @Test + public void testIdentityPassthroughAccepted() { + TimeExprValidator.validate( + parseSelectExpr("ts"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.MILLISECONDS.toMillis(1)); + } + + @Test + public void testIdentityRejectedWhenMismatchedBaseColumn() { + IllegalStateException ex = expectThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + 
parseSelectExpr("other_col"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.MILLISECONDS.toMillis(1))); + String msg = ex.getMessage(); + org.testng.Assert.assertTrue(msg.contains("derive from base time column 'ts'"), msg); + } + + @Test + public void testDatetruncAcceptedWhenUnitMatchesBucket() { + TimeExprValidator.validate( + parseSelectExpr("DATETRUNC('DAY', ts)"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1)); + } + + @Test + public void testDatetruncRejectedWhenUnitMismatchesBucket() { + IllegalStateException ex = expectThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("DATETRUNC('HOUR', ts)"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + org.testng.Assert.assertTrue(ex.getMessage().contains("does not match the declared bucketTimePeriod")); + } + + @Test + public void testDatetruncRejectedForCalendarUnit() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("DATETRUNC('WEEK', ts)"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(7))); + } + + @Test + public void testDatetruncRejectedWhenSecondArgIsNotBaseColumn() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("DATETRUNC('DAY', other_col)"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + } + + @Test + public void testRejectsNonTimestampBaseColumn() { + IllegalStateException ex = expectThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("ts"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + org.testng.Assert.assertTrue(ex.getMessage().contains("TIMESTAMP")); + } + + @Test + public void testRejectsNonTimestampViewColumn() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("ts"), + "ts", TIMESTAMP_FIELD, + "tsMv", LONG_EPOCH_DAYS_FIELD,
TimeUnit.MILLISECONDS.toMillis(1))); + } + + @Test + public void testRejectsDateTimeConvert() { + IllegalStateException ex = expectThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("dateTimeConvert(ts, '1:MILLISECONDS:EPOCH', '1:DAYS:EPOCH', '1:DAYS')"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + org.testng.Assert.assertTrue(ex.getMessage().contains("unsupported function")); + } + + @Test + public void testRejectsToDateTime() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("toDateTime(ts, 'yyyy-MM-dd')"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + } + + @Test + public void testDatetruncRejectsNonDefaultTimezone() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("DATETRUNC('DAY', ts, 'MILLISECONDS', 'America/Los_Angeles')"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + } + + @Test + public void testArithmeticScalingAcceptedFromNonTimestampBase() { + // Base is LONG epoch-days, MV is TIMESTAMP; SELECT multiplies days by ms-per-day to produce millis. + TimeExprValidator.validate( + parseSelectExpr("ts * 86400000"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1)); + } + + @Test + public void testArithmeticScalingAcceptedWithChainedMultiplication() { + // Same as above but the user broke the scale factor into a chain (24*60*60*1000).
+ TimeExprValidator.validate( + parseSelectExpr("ts * 24 * 60 * 60 * 1000"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1)); + } + + @Test + public void testArithmeticScalingRejectedWhenBaseColumnNotReferenced() { + IllegalStateException ex = expectThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("other_col * 86400000"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + org.testng.Assert.assertTrue(ex.getMessage().contains("base time column 'ts'"), ex.getMessage()); + } + + @Test + public void testArithmeticScalingRejectedWhenBaseColumnReferencedTwice() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("ts * ts"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + } + + @Test + public void testArithmeticScalingRejectsAddition() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("ts + 86400000"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + } + + @Test + public void testArithmeticScalingRejectsNonPositiveLiteral() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("ts * 0"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + } + + @Test + public void testIdentityRejectedWhenBaseIsNonTimestamp() { + IllegalStateException ex = expectThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("ts"), + "ts", LONG_EPOCH_DAYS_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + org.testng.Assert.assertTrue(ex.getMessage().contains("Identity passthrough requires"), ex.getMessage()); + } + + @Test + public void testDatetruncRejectsNonDefaultInputTimeUnit() { + assertThrows(IllegalStateException.class, () -> + TimeExprValidator.validate( + parseSelectExpr("DATETRUNC('DAY', ts,
'SECONDS')"), + "ts", TIMESTAMP_FIELD, + "tsMv", TIMESTAMP_FIELD, + TimeUnit.DAYS.toMillis(1))); + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManagerTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManagerTest.java new file mode 100644 index 000000000000..914c1c9b5b62 --- /dev/null +++ b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/consistency/MaterializedViewConsistencyManagerTest.java @@ -0,0 +1,216 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.materializedview.consistency; + +import java.util.List; +import java.util.Map; +import org.apache.helix.AccessOption; +import org.apache.helix.store.zk.ZkHelixPropertyStore; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.helix.zookeeper.zkclient.IZkChildListener; +import org.apache.pinot.common.metadata.ZKMetadataProvider; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadata; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadata; +import org.apache.pinot.materializedview.metadata.PartitionFingerprint; +import org.apache.pinot.materializedview.metadata.PartitionInfo; +import org.apache.pinot.materializedview.metadata.PartitionState; +import org.apache.zookeeper.data.Stat; +import org.mockito.ArgumentCaptor; +import org.testng.annotations.Test; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; + + +public class MaterializedViewConsistencyManagerTest { + private static final String BASE_TABLE = "baseTable"; + private static final String MV_TABLE = "mvTable_OFFLINE"; + private static final long BUCKET_MS = 86_400_000L; + + @Test + public void testEpochZeroRangeMarksPartitionStale() + throws Exception { + ZkHelixPropertyStore propertyStore = mock(ZkHelixPropertyStore.class); + String runtimePath = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(MV_TABLE); + MaterializedViewRuntimeMetadata runtime = new MaterializedViewRuntimeMetadata( + MV_TABLE, 2 * BUCKET_MS, + Map.of( + 0L, validInfo(), + BUCKET_MS, validInfo())); + when(propertyStore.get(eq(runtimePath), any(Stat.class), eq(AccessOption.PERSISTENT))) + .thenReturn(runtime.toZNRecord()); + when(propertyStore.set(eq(runtimePath), any(ZNRecord.class), eq(0), 
eq(AccessOption.PERSISTENT))) + .thenReturn(true); + + MaterializedViewConsistencyManager manager = new MaterializedViewConsistencyManager(); + manager.init(propertyStore); + manager.onMaterializedViewTableCreated(MV_TABLE, List.of(BASE_TABLE)); + manager.onBaseTableDataChange(BASE_TABLE, 0L, BUCKET_MS - 1); + + manager.flush(BASE_TABLE); + manager.stop(); + + ArgumentCaptor recordCaptor = ArgumentCaptor.forClass(ZNRecord.class); + verify(propertyStore).set(eq(runtimePath), recordCaptor.capture(), eq(0), eq(AccessOption.PERSISTENT)); + MaterializedViewRuntimeMetadata updated = + MaterializedViewRuntimeMetadata.fromZNRecord(recordCaptor.getValue()); + assertEquals(updated.getPartitions().get(0L).getState(), PartitionState.STALE); + assertEquals(updated.getPartitions().get(BUCKET_MS).getState(), PartitionState.VALID); + } + + @Test + public void testDefinitionCreatedAfterInitRegistersBaseTableMapping() + throws Exception { + ZkHelixPropertyStore propertyStore = mock(ZkHelixPropertyStore.class); + String definitionParentPath = ZKMetadataProvider.getPropertyStorePathForMaterializedViewDefinitionPrefix(); + String definitionPath = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewDefinition(MV_TABLE); + String runtimePath = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(MV_TABLE); + + when(propertyStore.getChildNames(eq(definitionParentPath), eq(AccessOption.PERSISTENT))) + .thenReturn(List.of(), List.of(), List.of(MV_TABLE)); + MaterializedViewDefinitionMetadata definition = new MaterializedViewDefinitionMetadata( + MV_TABLE, List.of(BASE_TABLE), "SELECT count(*) FROM baseTable", Map.of(), null); + when(propertyStore.get(eq(definitionPath), any(), eq(AccessOption.PERSISTENT))) + .thenReturn(definition.toZNRecord()); + + MaterializedViewRuntimeMetadata runtime = new MaterializedViewRuntimeMetadata( + MV_TABLE, 2 * BUCKET_MS, + Map.of( + 0L, validInfo(), + BUCKET_MS, validInfo())); + when(propertyStore.get(eq(runtimePath), 
any(Stat.class), eq(AccessOption.PERSISTENT))) + .thenReturn(runtime.toZNRecord()); + when(propertyStore.set(eq(runtimePath), any(ZNRecord.class), eq(0), eq(AccessOption.PERSISTENT))) + .thenReturn(true); + + MaterializedViewConsistencyManager manager = new MaterializedViewConsistencyManager(); + manager.init(propertyStore); + + ArgumentCaptor childListenerCaptor = ArgumentCaptor.forClass(IZkChildListener.class); + verify(propertyStore).subscribeChildChanges(eq(definitionParentPath), childListenerCaptor.capture()); + childListenerCaptor.getValue().handleChildChange(definitionParentPath, List.of(MV_TABLE)); + + manager.onBaseTableDataChange(BASE_TABLE, 0L, BUCKET_MS - 1); + manager.flush(BASE_TABLE); + manager.stop(); + + ArgumentCaptor recordCaptor = ArgumentCaptor.forClass(ZNRecord.class); + verify(propertyStore).set(eq(runtimePath), recordCaptor.capture(), eq(0), eq(AccessOption.PERSISTENT)); + MaterializedViewRuntimeMetadata updated = + MaterializedViewRuntimeMetadata.fromZNRecord(recordCaptor.getValue()); + assertEquals(updated.getPartitions().get(0L).getState(), PartitionState.STALE); + assertEquals(updated.getPartitions().get(BUCKET_MS).getState(), PartitionState.VALID); + } + + /// Regression test for M1: full-range invalidation must NOT create synthetic STALE entries + /// for buckets that are not present in the partition map. Under Design C, absent buckets + /// mean "MV does not cover this range"; the broker routes those queries to the base. + @Test + public void testFullInvalidationDoesNotSynthesizeAbsentBuckets() + throws Exception { + ZkHelixPropertyStore propertyStore = mock(ZkHelixPropertyStore.class); + String runtimePath = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(MV_TABLE); + // Watermark covers 10 buckets but only two adjacent buckets (#5, #6) are materialized. + // Adjacent so `inferBucketMsFromPartitions` derives BUCKET_MS from the gap. 
Pre-fix, the + // full-range invalidation would have synthesized 8 STALE entries for absent buckets; + // post-fix we expect just the two real buckets to flip to STALE. + MaterializedViewRuntimeMetadata runtime = new MaterializedViewRuntimeMetadata( + MV_TABLE, 10 * BUCKET_MS, + Map.of(5 * BUCKET_MS, validInfo(), + 6 * BUCKET_MS, validInfo())); + when(propertyStore.get(eq(runtimePath), any(Stat.class), eq(AccessOption.PERSISTENT))) + .thenReturn(runtime.toZNRecord()); + when(propertyStore.set(eq(runtimePath), any(ZNRecord.class), eq(0), eq(AccessOption.PERSISTENT))) + .thenReturn(true); + + MaterializedViewConsistencyManager manager = new MaterializedViewConsistencyManager(); + manager.init(propertyStore); + manager.onMaterializedViewTableCreated(MV_TABLE, List.of(BASE_TABLE)); + manager.onBaseTableFullInvalidation(BASE_TABLE); + + manager.flush(BASE_TABLE); + manager.stop(); + + ArgumentCaptor recordCaptor = ArgumentCaptor.forClass(ZNRecord.class); + verify(propertyStore).set(eq(runtimePath), recordCaptor.capture(), eq(0), eq(AccessOption.PERSISTENT)); + MaterializedViewRuntimeMetadata updated = + MaterializedViewRuntimeMetadata.fromZNRecord(recordCaptor.getValue()); + assertEquals(updated.getPartitions().size(), 2, + "Full invalidation must not synthesize absent-bucket STALE entries"); + assertEquals(updated.getPartitions().get(5 * BUCKET_MS).getState(), PartitionState.STALE); + assertEquals(updated.getPartitions().get(6 * BUCKET_MS).getState(), PartitionState.STALE); + } + + /// Regression test for M3 + the typed-exception narrowing on persist(): a CAS conflict + /// during STALE marking is silently retried; the retry succeeds. 
+ @Test + public void testCasConflictTriggersRetry() + throws Exception { + ZkHelixPropertyStore propertyStore = mock(ZkHelixPropertyStore.class); + String runtimePath = ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime(MV_TABLE); + MaterializedViewRuntimeMetadata runtime = new MaterializedViewRuntimeMetadata( + MV_TABLE, 2 * BUCKET_MS, + Map.of(0L, validInfo(), BUCKET_MS, validInfo())); + when(propertyStore.get(eq(runtimePath), any(Stat.class), eq(AccessOption.PERSISTENT))) + .thenReturn(runtime.toZNRecord()); + // First write fails (CAS conflict), second succeeds. Retry must converge. + when(propertyStore.set(eq(runtimePath), any(ZNRecord.class), eq(0), eq(AccessOption.PERSISTENT))) + .thenReturn(false, true); + + MaterializedViewConsistencyManager manager = new MaterializedViewConsistencyManager(); + manager.init(propertyStore); + manager.onMaterializedViewTableCreated(MV_TABLE, List.of(BASE_TABLE)); + manager.onBaseTableDataChange(BASE_TABLE, 0L, BUCKET_MS - 1); + + manager.flush(BASE_TABLE); + manager.stop(); + + // Two set() invocations expected — first returned false (CAS conflict), second succeeded. + verify(propertyStore, org.mockito.Mockito.times(2)) + .set(eq(runtimePath), any(ZNRecord.class), eq(0), eq(AccessOption.PERSISTENT)); + } + + /// Regression test for the lock-free fast-path on onBaseTableDataChange when no MV + /// references the base table — must return without touching the property store. + @Test + public void testNoDependentMvSkipsPropertyStore() + throws Exception { + ZkHelixPropertyStore propertyStore = mock(ZkHelixPropertyStore.class); + + MaterializedViewConsistencyManager manager = new MaterializedViewConsistencyManager(); + manager.init(propertyStore); + // No onMaterializedViewTableCreated call; reverse index is empty. 
+ manager.onBaseTableDataChange(BASE_TABLE, 0L, 1_000_000L); + manager.flush(BASE_TABLE); + manager.stop(); + + // The property-store `set` MUST never be invoked when no MV depends on the base table. + verify(propertyStore, org.mockito.Mockito.never()) + .set(org.mockito.ArgumentMatchers.anyString(), any(ZNRecord.class), + org.mockito.ArgumentMatchers.anyInt(), org.mockito.ArgumentMatchers.anyInt()); + } + + private static PartitionInfo validInfo() { + return new PartitionInfo(PartitionState.VALID, new PartitionFingerprint(1, 1234L), 10L); + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutorTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutorTest.java new file mode 100644 index 000000000000..74dbbe105777 --- /dev/null +++ b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/executor/GrpcMaterializedViewQueryExecutorTest.java @@ -0,0 +1,211 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.materializedview.executor; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.helix.HelixManager; +import org.apache.helix.model.InstanceConfig; +import org.apache.pinot.common.config.GrpcConfig; +import org.apache.pinot.spi.utils.CommonConstants; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + + +public class GrpcMaterializedViewQueryExecutorTest { + private HelixManager _helixManager; + private GrpcMaterializedViewQueryExecutor _queryExecutor; + + @BeforeMethod + public void setUp() { + _helixManager = mock(HelixManager.class); + _queryExecutor = new GrpcMaterializedViewQueryExecutor(_helixManager, new GrpcConfig(Collections.emptyMap())); + } + + @AfterMethod + public void tearDown() { + _queryExecutor.close(); + } + + @Test + public void testSelectBrokerRoundRobin() { + List configs = new ArrayList<>(); + configs.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 8090)); + configs.add(buildBrokerConfig("Broker_broker2_8099", "broker2", 8091)); + configs.add(buildBrokerConfig("Broker_broker3_8099", "broker3", 8092)); + when(_helixManager.getClusterName()).thenReturn("testCluster"); + when(_helixManager.getInstanceName()).thenReturn("Minion_minion1_9514"); + mockHelixInstanceConfigs(configs); + + Set selectedHosts = new HashSet<>(); + for (int i = 0; i < 6; i++) { + Pair broker = _queryExecutor.selectBroker(); + selectedHosts.add(broker.getLeft() + ":" + broker.getRight()); + } + + assertEquals(selectedHosts.size(), 3, "All 3 brokers should be selected via round-robin"); + 
assertTrue(selectedHosts.contains("broker1:8090")); + assertTrue(selectedHosts.contains("broker2:8091")); + assertTrue(selectedHosts.contains("broker3:8092")); + } + + @Test + public void testSelectBrokerSingleBroker() { + List configs = new ArrayList<>(); + configs.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 8090)); + mockHelixInstanceConfigs(configs); + + for (int i = 0; i < 3; i++) { + Pair broker = _queryExecutor.selectBroker(); + assertEquals(broker.getLeft(), "broker1"); + assertEquals(broker.getRight().intValue(), 8090); + } + } + + @Test + public void testSelectBrokerNoBrokersThrows() { + mockHelixInstanceConfigs(Collections.emptyList()); + + try { + _queryExecutor.selectBroker(); + fail("Expected IllegalStateException when no gRPC-enabled brokers exist"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("No broker with gRPC enabled")); + } + } + + @Test + public void testSelectBrokerSkipsBrokersWithoutGrpcPort() { + List configs = new ArrayList<>(); + configs.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 0)); + configs.add(buildBrokerConfig("Broker_broker2_8099", "broker2", 8091)); + + InstanceConfig noGrpcBroker = new InstanceConfig("Broker_broker3_8099"); + noGrpcBroker.setHostName("broker3"); + configs.add(noGrpcBroker); + + mockHelixInstanceConfigs(configs); + + for (int i = 0; i < 3; i++) { + Pair broker = _queryExecutor.selectBroker(); + assertEquals(broker.getLeft(), "broker2"); + assertEquals(broker.getRight().intValue(), 8091); + } + } + + @Test + public void testSelectBrokerSkipsNonBrokerInstances() { + List configs = new ArrayList<>(); + configs.add(buildServerConfig("Server_server1_8098", "server1", 8090)); + configs.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 8090)); + mockHelixInstanceConfigs(configs); + + Pair broker = _queryExecutor.selectBroker(); + assertEquals(broker.getLeft(), "broker1"); + } + + @Test + public void testStaleClientEviction() { + List twoConfigs = new 
ArrayList<>(); + twoConfigs.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 8090)); + twoConfigs.add(buildBrokerConfig("Broker_broker2_8099", "broker2", 8091)); + mockHelixInstanceConfigs(twoConfigs); + + _queryExecutor.selectBroker(); + _queryExecutor.selectBroker(); + assertEquals(_queryExecutor.getCachedClientCount(), 2); + + List oneConfig = new ArrayList<>(); + oneConfig.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 8090)); + mockHelixInstanceConfigs(oneConfig); + + _queryExecutor.selectBroker(); + assertEquals(_queryExecutor.getCachedClientCount(), 1); + } + + @Test + public void testClientReuse() { + List configs = new ArrayList<>(); + configs.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 8090)); + mockHelixInstanceConfigs(configs); + + _queryExecutor.selectBroker(); + _queryExecutor.selectBroker(); + _queryExecutor.selectBroker(); + + assertEquals(_queryExecutor.getCachedClientCount(), 1, + "Repeated selections of the same broker should reuse the cached client"); + } + + @Test + public void testCloseEvictsAllClients() { + List configs = new ArrayList<>(); + configs.add(buildBrokerConfig("Broker_broker1_8099", "broker1", 8090)); + configs.add(buildBrokerConfig("Broker_broker2_8099", "broker2", 8091)); + mockHelixInstanceConfigs(configs); + + _queryExecutor.selectBroker(); + _queryExecutor.selectBroker(); + assertEquals(_queryExecutor.getCachedClientCount(), 2); + + _queryExecutor.close(); + assertEquals(_queryExecutor.getCachedClientCount(), 0); + } + + private InstanceConfig buildBrokerConfig(String instanceName, String hostname, int grpcPort) { + InstanceConfig config = new InstanceConfig(instanceName); + config.setHostName(hostname); + if (grpcPort > 0) { + config.getRecord().setSimpleField(CommonConstants.Helix.Instance.GRPC_PORT_KEY, + String.valueOf(grpcPort)); + } + return config; + } + + private InstanceConfig buildServerConfig(String instanceName, String hostname, int grpcPort) { + InstanceConfig config = new 
InstanceConfig(instanceName); + config.setHostName(hostname); + if (grpcPort > 0) { + config.getRecord().setSimpleField(CommonConstants.Helix.Instance.GRPC_PORT_KEY, + String.valueOf(grpcPort)); + } + return config; + } + + private void mockHelixInstanceConfigs(List configs) { + org.apache.helix.HelixDataAccessor accessor = mock(org.apache.helix.HelixDataAccessor.class); + org.apache.helix.PropertyKey.Builder keyBuilder = mock(org.apache.helix.PropertyKey.Builder.class); + org.apache.helix.PropertyKey propertyKey = mock(org.apache.helix.PropertyKey.class); + when(_helixManager.getHelixDataAccessor()).thenReturn(accessor); + when(accessor.keyBuilder()).thenReturn(keyBuilder); + when(keyBuilder.instanceConfigs()).thenReturn(propertyKey); + when(accessor.getChildValues(propertyKey, true)).thenReturn(new ArrayList<>(configs)); + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/MaterializedViewMetadataTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/MaterializedViewMetadataTest.java new file mode 100644 index 000000000000..36ae931415cb --- /dev/null +++ b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/MaterializedViewMetadataTest.java @@ -0,0 +1,155 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.metadata; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.pinot.materializedview.metadata.MaterializedViewDefinitionMetadata.MaterializedViewSplitSpec; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + + +public class MaterializedViewMetadataTest { + + @Test + public void testDefinitionRoundTrip() { + String viewTableName = "mv_daily_order_amount_OFFLINE"; + Map partitionExprMaps = new HashMap<>(); + partitionExprMaps.put("DaysSinceEpoch", "DaysSinceEpoch"); + + MaterializedViewSplitSpec splitSpec = new MaterializedViewSplitSpec( + "ts", "1:MILLISECONDS:EPOCH", "DaysSinceEpoch", 86400000L); + + MaterializedViewDefinitionMetadata original = new MaterializedViewDefinitionMetadata( + viewTableName, + Arrays.asList("orders_OFFLINE", "products_OFFLINE"), + "SELECT DaysSinceEpoch, city, count(*) as cnt FROM orders GROUP BY DaysSinceEpoch, city", + partitionExprMaps, + splitSpec); + + ZNRecord znRecord = original.toZNRecord(); + assertEquals(znRecord.getId(), viewTableName); + + MaterializedViewDefinitionMetadata restored = MaterializedViewDefinitionMetadata.fromZNRecord(znRecord); + assertEquals(restored.getMaterializedViewTableNameWithType(), viewTableName); + assertEquals(restored.getBaseTables(), 
Arrays.asList("orders_OFFLINE", "products_OFFLINE")); + assertNotNull(restored.getDefinedSql()); + assertEquals(restored.getPartitionExprMaps().size(), 1); + assertEquals(restored.getPartitionExprMaps().get("DaysSinceEpoch"), "DaysSinceEpoch"); + + MaterializedViewSplitSpec restoredSpec = restored.getSplitSpec(); + assertNotNull(restoredSpec); + assertEquals(restoredSpec.getSourceTimeColumn(), "ts"); + assertEquals(restoredSpec.getMaterializedViewTimeColumn(), "DaysSinceEpoch"); + assertEquals(restoredSpec.getBucketMs(), 86400000L); + } + + @Test + public void testDefinitionWithNoSplitSpec() { + MaterializedViewDefinitionMetadata metadata = new MaterializedViewDefinitionMetadata( + "mv_OFFLINE", + Collections.singletonList("src_OFFLINE"), + null, + Collections.emptyMap(), + null); + + ZNRecord znRecord = metadata.toZNRecord(); + MaterializedViewDefinitionMetadata restored = MaterializedViewDefinitionMetadata.fromZNRecord(znRecord); + + assertEquals(restored.getBaseTables(), Collections.singletonList("src_OFFLINE")); + assertNull(restored.getDefinedSql()); + assertTrue(restored.getPartitionExprMaps().isEmpty()); + assertNull(restored.getSplitSpec()); + } + + @Test + public void testRuntimeRoundTrip() { + Map partitions = new HashMap<>(); + partitions.put(86400000L, new PartitionInfo( + PartitionState.VALID, new PartitionFingerprint(10, 5000L), 1700010000000L)); + partitions.put(172800000L, new PartitionInfo( + PartitionState.STALE, new PartitionFingerprint(8, 3200L), 1700090000000L)); + + MaterializedViewRuntimeMetadata original = new MaterializedViewRuntimeMetadata( + "mv_test_OFFLINE", 259200000L, partitions); + + ZNRecord znRecord = original.toZNRecord(); + MaterializedViewRuntimeMetadata restored = MaterializedViewRuntimeMetadata.fromZNRecord(znRecord); + + assertEquals(restored.getMaterializedViewTableNameWithType(), "mv_test_OFFLINE"); + assertEquals(restored.getWatermarkMs(), 259200000L); + assertEquals(restored.getPartitions().size(), 2); + + PartitionInfo 
info1 = restored.getPartitions().get(86400000L); + assertEquals(info1.getState(), PartitionState.VALID); + assertEquals(info1.getFingerprint(), new PartitionFingerprint(10, 5000L)); + assertEquals(info1.getLastRefreshTime(), 1700010000000L); + + assertEquals(restored.getPartitions().get(172800000L).getState(), PartitionState.STALE); + } + + @Test + public void testRuntimeEmptyPartitions() { + MaterializedViewRuntimeMetadata metadata = new MaterializedViewRuntimeMetadata( + "mv_OFFLINE", 0L, new HashMap<>()); + + ZNRecord znRecord = metadata.toZNRecord(); + MaterializedViewRuntimeMetadata restored = MaterializedViewRuntimeMetadata.fromZNRecord(znRecord); + + assertEquals(restored.getWatermarkMs(), 0L); + assertTrue(restored.getPartitions().isEmpty()); + } + + @Test + public void testColdStartWatermarkOnlyEmptyPartitions() { + MaterializedViewRuntimeMetadata metadata = new MaterializedViewRuntimeMetadata( + "mv_OFFLINE", 86400000L, new HashMap<>()); + + ZNRecord znRecord = metadata.toZNRecord(); + MaterializedViewRuntimeMetadata restored = MaterializedViewRuntimeMetadata.fromZNRecord(znRecord); + + assertEquals(restored.getWatermarkMs(), 86400000L); + assertTrue(restored.getPartitions().isEmpty()); + } + + @Test + public void testValidateForPersistAcceptsAnyValidState() { + MaterializedViewRuntimeMetadata legitimate = new MaterializedViewRuntimeMetadata( + "mv_OFFLINE", 200L, new HashMap<>()); + legitimate.validateForPersist(); // must not throw — no cross-field invariants under Design C + } + + @Test + public void testWatermarkAlwaysWritten() { + // Even at cold-start (watermark=0), the watermark key must be written so the reader + // round-trips zero correctly without depending on field-presence heuristics. 
+ MaterializedViewRuntimeMetadata metadata = new MaterializedViewRuntimeMetadata( + "mv_OFFLINE", 0L, new HashMap<>()); + ZNRecord znRecord = metadata.toZNRecord(); + assertTrue(znRecord.getSimpleFields().containsKey("watermarkMs"), + "watermarkMs key must always be written"); + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionFingerprintTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionFingerprintTest.java new file mode 100644 index 000000000000..703306612e2f --- /dev/null +++ b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionFingerprintTest.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.materializedview.metadata; + +import java.util.HashMap; +import java.util.Map; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class PartitionFingerprintTest { + + @Test + public void testEncodeAndDecode() { + PartitionFingerprint fp = new PartitionFingerprint(5, 123456789L); + String encoded = fp.encode(); + assertEquals(encoded, "5,123456789"); + + PartitionFingerprint decoded = PartitionFingerprint.decode(encoded); + assertEquals(decoded.getSegmentCount(), 5); + assertEquals(decoded.getCrcChecksum(), 123456789L); + assertEquals(decoded, fp); + } + + @Test + public void testEncodeAndDecodeZeroValues() { + PartitionFingerprint fp = new PartitionFingerprint(0, 0L); + String encoded = fp.encode(); + assertEquals(encoded, "0,0"); + + PartitionFingerprint decoded = PartitionFingerprint.decode(encoded); + assertEquals(decoded.getSegmentCount(), 0); + assertEquals(decoded.getCrcChecksum(), 0L); + assertEquals(decoded, fp); + } + + @Test + public void testEncodeAndDecodeNegativeCrc() { + PartitionFingerprint fp = new PartitionFingerprint(3, -999L); + PartitionFingerprint decoded = PartitionFingerprint.decode(fp.encode()); + assertEquals(decoded, fp); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testDecodeInvalidNoSeparator() { + PartitionFingerprint.decode("12345"); + } + + @Test + public void testEncodeMapEmpty() { + assertEquals(PartitionFingerprint.encodeMap(new HashMap<>()), ""); + assertEquals(PartitionFingerprint.encodeMap(null), ""); + } + + @Test + public void testEncodeMapSingleEntry() { + Map map = new HashMap<>(); + map.put(1700006400000L, new PartitionFingerprint(10, 5000L)); + + String encoded = PartitionFingerprint.encodeMap(map); + assertEquals(encoded, "1700006400000=10,5000"); + + Map decoded = PartitionFingerprint.decodeMap(encoded); + assertEquals(decoded.size(), 1); + assertEquals(decoded.get(1700006400000L), new PartitionFingerprint(10, 5000L)); 
+ } + + @Test + public void testEncodeMapMultipleEntries() { + Map map = new HashMap<>(); + map.put(1000L, new PartitionFingerprint(1, 100L)); + map.put(2000L, new PartitionFingerprint(2, 200L)); + map.put(3000L, new PartitionFingerprint(3, 300L)); + + String encoded = PartitionFingerprint.encodeMap(map); + Map decoded = PartitionFingerprint.decodeMap(encoded); + + assertEquals(decoded.size(), 3); + assertEquals(decoded.get(1000L), new PartitionFingerprint(1, 100L)); + assertEquals(decoded.get(2000L), new PartitionFingerprint(2, 200L)); + assertEquals(decoded.get(3000L), new PartitionFingerprint(3, 300L)); + } + + @Test + public void testDecodeMapEmptyString() { + assertTrue(PartitionFingerprint.decodeMap("").isEmpty()); + assertTrue(PartitionFingerprint.decodeMap(null).isEmpty()); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testDecodeMapInvalidEntry() { + PartitionFingerprint.decodeMap("badentry"); + } + + @Test + public void testEqualsAndHashCode() { + PartitionFingerprint a = new PartitionFingerprint(5, 999L); + PartitionFingerprint b = new PartitionFingerprint(5, 999L); + PartitionFingerprint c = new PartitionFingerprint(5, 998L); + PartitionFingerprint d = new PartitionFingerprint(4, 999L); + + assertEquals(a, b); + assertEquals(a.hashCode(), b.hashCode()); + assertNotEquals(a, c); + assertNotEquals(a, d); + assertNotEquals(a, null); + } + + @Test + public void testToString() { + PartitionFingerprint fp = new PartitionFingerprint(3, 42L); + assertTrue(fp.toString().contains("segmentCount=3")); + assertTrue(fp.toString().contains("crcChecksum=42")); + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionInfoTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionInfoTest.java new file mode 100644 index 000000000000..4a0cbf1decb3 --- /dev/null +++ 
b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionInfoTest.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.metadata; + +import java.util.HashMap; +import java.util.Map; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class PartitionInfoTest { + + @Test + public void testToAndFromFieldMapValid() { + PartitionInfo info = new PartitionInfo( + PartitionState.VALID, new PartitionFingerprint(10, 5000L), 1700006400000L); + Map fieldMap = info.toFieldMap(); + assertEquals(fieldMap.get("state"), "V"); + assertEquals(fieldMap.get("segmentCount"), "10"); + assertEquals(fieldMap.get("crc"), "5000"); + assertEquals(fieldMap.get("lastRefreshTime"), "1700006400000"); + + PartitionInfo decoded = PartitionInfo.fromFieldMap(fieldMap); + assertEquals(decoded.getState(), PartitionState.VALID); + assertEquals(decoded.getFingerprint().getSegmentCount(), 10); + assertEquals(decoded.getFingerprint().getCrcChecksum(), 5000L); + assertEquals(decoded.getLastRefreshTime(), 1700006400000L); + assertEquals(decoded, info); + } + + @Test + public void testToAndFromFieldMapStale() { + 
PartitionInfo info = new PartitionInfo( + PartitionState.STALE, new PartitionFingerprint(3, -999L), 0L); + Map fieldMap = info.toFieldMap(); + assertEquals(fieldMap.get("state"), "S"); + + PartitionInfo decoded = PartitionInfo.fromFieldMap(fieldMap); + assertEquals(decoded.getState(), PartitionState.STALE); + assertEquals(decoded.getFingerprint(), new PartitionFingerprint(3, -999L)); + assertEquals(decoded.getLastRefreshTime(), 0L); + assertEquals(decoded, info); + } + + @Test + public void testToAndFromFieldMapZeroValues() { + PartitionInfo info = new PartitionInfo( + PartitionState.VALID, new PartitionFingerprint(0, 0L), 0L); + PartitionInfo decoded = PartitionInfo.fromFieldMap(info.toFieldMap()); + assertEquals(decoded, info); + } + + @Test + public void testFromFieldMapIgnoresUnknownKeys() { + Map fieldMap = new HashMap<>(); + fieldMap.put("state", "V"); + fieldMap.put("segmentCount", "10"); + fieldMap.put("crc", "5000"); + fieldMap.put("lastRefreshTime", "1700006400000"); + fieldMap.put("unknownFutureField", "something"); + + PartitionInfo decoded = PartitionInfo.fromFieldMap(fieldMap); + assertEquals(decoded.getState(), PartitionState.VALID); + assertEquals(decoded.getFingerprint().getSegmentCount(), 10); + } + + @Test + public void testWithState() { + PartitionFingerprint fp = new PartitionFingerprint(5, 1234L); + PartitionInfo valid = new PartitionInfo(PartitionState.VALID, fp, 1000L); + PartitionInfo stale = valid.withState(PartitionState.STALE); + + assertEquals(stale.getState(), PartitionState.STALE); + assertEquals(stale.getFingerprint(), fp); + assertEquals(stale.getLastRefreshTime(), 1000L); + assertNotEquals(stale, valid); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testFromFieldMapMissingState() { + Map map = new HashMap<>(); + map.put("segmentCount", "10"); + map.put("crc", "5000"); + map.put("lastRefreshTime", "1000"); + PartitionInfo.fromFieldMap(map); + } + + @Test(expectedExceptions = 
IllegalArgumentException.class) + public void testFromFieldMapInvalidState() { + Map map = new HashMap<>(); + map.put("state", "X"); + map.put("segmentCount", "10"); + map.put("crc", "5000"); + map.put("lastRefreshTime", "1000"); + PartitionInfo.fromFieldMap(map); + } + + @Test + public void testEqualsAndHashCode() { + PartitionFingerprint fp = new PartitionFingerprint(5, 100L); + PartitionInfo a = new PartitionInfo(PartitionState.VALID, fp, 1000L); + PartitionInfo b = new PartitionInfo(PartitionState.VALID, fp, 1000L); + PartitionInfo c = new PartitionInfo(PartitionState.STALE, fp, 1000L); + PartitionInfo d = new PartitionInfo(PartitionState.VALID, fp, 2000L); + + assertEquals(a, b); + assertEquals(a.hashCode(), b.hashCode()); + assertNotEquals(a, c); + assertNotEquals(a, d); + assertNotEquals(a, null); + } + + @Test + public void testToString() { + PartitionInfo info = new PartitionInfo( + PartitionState.VALID, new PartitionFingerprint(3, 42L), 999L); + String str = info.toString(); + assertTrue(str.contains("VALID")); + assertTrue(str.contains("lastRefreshTime=999")); + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionStateTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionStateTest.java new file mode 100644 index 000000000000..7d6bf1fd57e6 --- /dev/null +++ b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/metadata/PartitionStateTest.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.metadata; + +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class PartitionStateTest { + + @Test + public void testEncodeValid() { + assertEquals(PartitionState.VALID.encode(), "V"); + } + + @Test + public void testEncodeStale() { + assertEquals(PartitionState.STALE.encode(), "S"); + } + + @Test + public void testDecodeValid() { + assertEquals(PartitionState.decode("V"), PartitionState.VALID); + } + + @Test + public void testDecodeStale() { + assertEquals(PartitionState.decode("S"), PartitionState.STALE); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testDecodeUnknownCodeRejected() { + PartitionState.decode("E"); + } + + @Test + public void testRoundTrip() { + for (PartitionState state : PartitionState.values()) { + assertEquals(PartitionState.decode(state.encode()), state); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testDecodeUnknownCode() { + PartitionState.decode("X"); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testDecodeEmptyString() { + PartitionState.decode(""); + } +} diff --git a/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskSchedulerTest.java b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskSchedulerTest.java new file mode 100644 index 000000000000..07888dbcb6f3 --- /dev/null +++ 
b/pinot-materialized-view/src/test/java/org/apache/pinot/materializedview/scheduler/MaterializedViewTaskSchedulerTest.java @@ -0,0 +1,238 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.materializedview.scheduler; + +import java.util.Collections; +import java.util.Optional; +import org.apache.helix.AccessOption; +import org.apache.helix.store.HelixPropertyStore; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.pinot.common.metadata.ZKMetadataProvider; +import org.apache.pinot.materializedview.analysis.MaterializedViewAnalyzer; +import org.apache.pinot.materializedview.context.MaterializedViewTaskGeneratorContext; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadata; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; +import org.apache.zookeeper.data.Stat; +import org.testng.annotations.Test; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static 
org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + + +public class MaterializedViewTaskSchedulerTest { + + @Test + public void testAppendTimeRangeNoWhereClause() { + String sql = "SELECT col1, SUM(col2) FROM myTable GROUP BY col1"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertEquals(result, "SELECT col1, SUM(col2) FROM myTable WHERE ts >= 100 AND ts < 200 GROUP BY col1"); + } + + @Test + public void testAppendTimeRangeWithExistingWhere() { + String sql = "SELECT col1 FROM myTable WHERE col2 = 'foo' GROUP BY col1"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertEquals(result, "SELECT col1 FROM myTable WHERE col2 = 'foo' AND ts >= 100 AND ts < 200 GROUP BY col1"); + } + + @Test + public void testAppendTimeRangeWithLimit() { + String sql = "SELECT col1 FROM myTable GROUP BY col1 LIMIT 50"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertEquals(result, "SELECT col1 FROM myTable WHERE ts >= 100 AND ts < 200 GROUP BY col1 LIMIT 50"); + } + + @Test + public void testAppendTimeRangeStripsTrailingSemicolon() { + String sql = "SELECT col1 FROM myTable;"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertEquals(result, "SELECT col1 FROM myTable WHERE ts >= 100 AND ts < 200"); + } + + // --------------------------------------------------------------------------- + // appendTimeRange quote-mask: keyword scans must skip text inside string + // literals and quoted identifiers so user-controlled values cannot fool the + // splitter into corrupting the SQL. 
+ // --------------------------------------------------------------------------- + + @Test + public void testAppendTimeRangeIgnoresKeywordsInsideStringLiterals() { + String sql = "SELECT col1 FROM myTable WHERE name = 'Acme WHERE Co' AND tag <> 'GROUP BY hack'"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + // The fake WHERE/GROUP BY inside string literals must remain untouched, and the time + // filter must AND-append at the real end of the WHERE conditions. + assertTrue(result.contains("'Acme WHERE Co'"), "literal text was modified: " + result); + assertTrue(result.contains("'GROUP BY hack'"), "literal text was modified: " + result); + assertTrue(result.endsWith("AND ts >= 100 AND ts < 200"), "filter not appended at end: " + result); + } + + @Test + public void testAppendTimeRangeHandlesAnsiDoubledSingleQuoteEscape() { + // 'It''s' is the SQL-standard escape for a single quote inside a literal. + String sql = "SELECT col1 FROM myTable WHERE comment = 'It''s a WHERE test'"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertTrue(result.contains("'It''s a WHERE test'"), "doubled-quote literal modified: " + result); + assertTrue(result.endsWith("AND ts >= 100 AND ts < 200"), "filter not appended at end: " + result); + } + + @Test + public void testAppendTimeRangeKeywordInsideDoubleQuotedIdentifier() { + // Double-quoted identifier — treated like a literal by the quote mask. 
+ String sql = "SELECT \"WHERE\" FROM myTable"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertTrue(result.startsWith("SELECT \"WHERE\" FROM myTable"), + "quoted identifier was modified: " + result); + assertTrue(result.contains("WHERE ts >= 100"), "WHERE clause should be inserted: " + result); + } + + // Line-comment handling intentionally not exhaustively tested: + // appending text after a SQL that ends with `-- ...` would land inside the + // comment unless a newline is also inserted. The block-comment case below is + // safe because /* ... */ has a bounded end. Operators authoring definedSQL + // with trailing line comments will see a downstream parse failure (the + // verify-re-parse in buildTaskConfig catches it) rather than a silent + // injection. Block comments embedded in the middle of the SQL are safe. + + @Test + public void testAppendTimeRangeKeywordAsSubstringOfColumnName() { + // Column names that contain a SQL keyword as a substring (e.g. "WHERETO", + // "GROUPING_SET", "VARCHARLIMIT") must NOT be treated as the keyword. The + // boundary-aware scanner requires whitespace/punctuation on both sides. 
+ String sql = "SELECT WHERETO, GROUPING_SET FROM myTable"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertEquals(result, "SELECT WHERETO, GROUPING_SET FROM myTable WHERE ts >= 100 AND ts < 200"); + } + + @Test + public void testAppendTimeRangeIgnoresKeywordsInsideBlockComment() { + String sql = "SELECT col FROM myTable /* WHERE x AND GROUP BY y */ GROUP BY col"; + String result = MaterializedViewTaskScheduler.appendTimeRange(sql, "ts", "100", "200"); + assertTrue(result.contains("/* WHERE x AND GROUP BY y */"), "block comment modified: " + result); + assertTrue(result.contains("WHERE ts >= 100 AND ts < 200 GROUP BY col"), + "filter not inserted before real GROUP BY: " + result); + } + + // --------------------------------------------------------------------------- + // LIMIT-injection contract (driven via tryExtractDeclaredLimit + the constant) + // --------------------------------------------------------------------------- + + @Test + public void testNoLimitFallsBackToDefaultMaterializedViewQueryLimit() { + String sql = "SELECT col1 FROM myTable GROUP BY col1"; + Optional declared = MaterializedViewAnalyzer.tryExtractDeclaredLimit(sql); + assertFalse(declared.isPresent(), "definedSQL has no LIMIT; should be empty"); + int effectiveLimit = declared.orElse(MaterializedViewTask.DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT); + assertEquals(effectiveLimit, 1_000_000, "fallback must equal DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT (1M)"); + } + + @Test + public void testUserDeclaredLimitIsHonored() { + String sql = "SELECT col1 FROM myTable GROUP BY col1 LIMIT 5000"; + Optional declared = MaterializedViewAnalyzer.tryExtractDeclaredLimit(sql); + assertTrue(declared.isPresent()); + assertEquals(declared.get().intValue(), 5000); + } + + // --------------------------------------------------------------------------- + // appendTimeRange + LIMIT-injection composition (load-bearing safety: the broker + // must observe the appended LIMIT, 
otherwise it applies its small default of 10) + // --------------------------------------------------------------------------- + + /// Helper that mirrors what generator.buildTaskConfig does for the no-LIMIT path. + private static String appendTimeRangeAndLimit(String definedSql, int limit) { + String withTimeRange = MaterializedViewTaskScheduler.appendTimeRange(definedSql, "ts", "100", "200"); + String trimmed = withTimeRange.trim(); + if (trimmed.endsWith(";")) { + trimmed = trimmed.substring(0, trimmed.length() - 1).trim(); + } + return trimmed + " LIMIT " + limit; + } + + private static void assertLimitObserved(String definedSql, int expectedLimit) { + String composed = appendTimeRangeAndLimit(definedSql, expectedLimit); + Optional observed = MaterializedViewAnalyzer.tryExtractDeclaredLimit(composed); + assertTrue(observed.isPresent(), + "Composed SQL had no parseable LIMIT — broker would silently truncate. Composed: " + composed); + assertEquals(observed.get().intValue(), expectedLimit, + "Composed SQL LIMIT mismatch. 
Composed: " + composed); + } + + @Test + public void testLimitInjectionGroupBy() { + assertLimitObserved("SELECT col1, count(*) FROM t GROUP BY col1", 1_000_000); + } + + @Test + public void testLimitInjectionWithExistingWhere() { + assertLimitObserved("SELECT col1 FROM t WHERE col2 = 'foo' GROUP BY col1", 1_000_000); + } + + @Test + public void testLimitInjectionWithOrderBy() { + assertLimitObserved("SELECT col1, count(*) FROM t GROUP BY col1 ORDER BY col1", 1_000_000); + } + + @Test + public void testLimitInjectionWithHaving() { + assertLimitObserved( + "SELECT col1, count(*) FROM t GROUP BY col1 HAVING count(*) > 0", 1_000_000); + } + + @Test + public void testLimitInjectionWithTrailingSemicolon() { + assertLimitObserved("SELECT col1 FROM t GROUP BY col1;", 1_000_000); + } + + @Test + public void testExistingRuntimeWithZeroWatermarkIsReturned() + throws Exception { + HelixPropertyStore propertyStore = mockPropertyStore(); + MaterializedViewTaskGeneratorContext context = mock(MaterializedViewTaskGeneratorContext.class); + when(context.getPropertyStore()).thenReturn(propertyStore); + + MaterializedViewRuntimeMetadata runtime = new MaterializedViewRuntimeMetadata( + "mv_OFFLINE", 0L, Collections.emptyMap()); + when(propertyStore.get( + eq(ZKMetadataProvider.constructPropertyStorePathForMaterializedViewRuntime("mv_OFFLINE")), + any(Stat.class), + eq(AccessOption.PERSISTENT))).thenReturn(runtime.toZNRecord()); + + MaterializedViewTaskScheduler scheduler = new MaterializedViewTaskScheduler(context); + long watermarkMs = scheduler.getWatermarkMs("mv", "orders", 86_400_000L, + "SELECT city, COUNT(*) FROM orders GROUP BY city", java.util.Map.of()); + + assertEquals(watermarkMs, 0L); + verify(context, never()).getSegmentsZKMetadata(anyString()); + verify(propertyStore, never()).set(anyString(), any(ZNRecord.class), anyInt(), eq(AccessOption.PERSISTENT)); + } + + @SuppressWarnings("unchecked") + private static HelixPropertyStore mockPropertyStore() { + return 
mock(HelixPropertyStore.class); + } +} diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/pom.xml b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/pom.xml index 734abff368b1..1ce2eee92fe0 100644 --- a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/pom.xml +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/pom.xml @@ -35,6 +35,10 @@ + + org.apache.pinot + pinot-materialized-view + org.apache.pinot pinot-yammer diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutor.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutor.java new file mode 100644 index 000000000000..ca77d9469e16 --- /dev/null +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutor.java @@ -0,0 +1,827 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.plugin.minion.tasks.materializedview; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.hash.Hasher; +import com.google.common.hash.Hashing; +import java.io.File; +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ThreadLocalRandom; +import org.apache.commons.io.FileUtils; +import org.apache.hc.core5.http.Header; +import org.apache.hc.core5.http.NameValuePair; +import org.apache.helix.HelixManager; +import org.apache.helix.model.HelixConfigScope; +import org.apache.helix.model.builder.HelixConfigScopeBuilder; +import org.apache.helix.store.HelixPropertyStore; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.helix.zookeeper.zkclient.exception.ZkException; +import org.apache.pinot.common.auth.AuthProviderUtils; +import org.apache.pinot.common.metadata.ZKMetadataProvider; +import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; +import org.apache.pinot.common.metadata.segment.SegmentZKMetadataCustomMapModifier; +import org.apache.pinot.common.restlet.resources.StartReplaceSegmentsRequest; +import org.apache.pinot.common.utils.DataSchema; +import org.apache.pinot.common.utils.FileUploadDownloadClient; +import org.apache.pinot.common.utils.TarCompressionUtils; +import org.apache.pinot.core.common.MinionConstants; +import org.apache.pinot.core.minion.PinotTaskConfig; +import org.apache.pinot.materializedview.executor.MaterializedViewQueryExecutor; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadata; +import org.apache.pinot.materializedview.metadata.MaterializedViewRuntimeMetadataUtils; 
+import org.apache.pinot.materializedview.metadata.PartitionFingerprint; +import org.apache.pinot.materializedview.metadata.PartitionInfo; +import org.apache.pinot.materializedview.metadata.PartitionState; +import org.apache.pinot.materializedview.scheduler.MaterializedViewTaskUtils; +import org.apache.pinot.minion.MinionConf; +import org.apache.pinot.minion.MinionContext; +import org.apache.pinot.minion.event.MinionEventObserver; +import org.apache.pinot.minion.event.MinionEventObservers; +import org.apache.pinot.minion.executor.MinionTaskZkMetadataManager; +import org.apache.pinot.plugin.minion.tasks.BaseTaskExecutor; +import org.apache.pinot.plugin.minion.tasks.SegmentConversionResult; +import org.apache.pinot.plugin.minion.tasks.SegmentConversionUtils; +import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; +import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader; +import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; +import org.apache.pinot.spi.auth.AuthProvider; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; +import org.apache.pinot.spi.utils.BytesUtils; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; +import org.apache.pinot.spi.utils.Obfuscator; +import org.apache.pinot.spi.utils.builder.TableNameBuilder; +import org.apache.zookeeper.data.Stat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/// Executor for [MaterializedViewTask]. +/// +/// This task receives a SQL query with an appended time range (from the generator), +/// executes it via a pluggable [MaterializedViewQueryExecutor] (e.g. gRPC, Arrow Flight), +/// and builds a segment from the query results for the MV table. 
+/// +/// Lifecycle: +/// +/// - `preProcess` – validates the watermark (APPEND) or the STALE target partition (OVERWRITE/DELETE) +/// - `executeTask` – queries broker, builds segment, uploads +/// - `postProcess` – advances watermark to windowEndMs +/// +public class MaterializedViewTaskExecutor extends BaseTaskExecutor { + private static final Logger LOGGER = LoggerFactory.getLogger(MaterializedViewTaskExecutor.class); + + private final MinionTaskZkMetadataManager _minionTaskZkMetadataManager; + private final MinionConf _minionConf; + private final MaterializedViewQueryExecutor _queryExecutor; + + public MaterializedViewTaskExecutor(MinionTaskZkMetadataManager minionTaskZkMetadataManager, + MinionConf minionConf, MaterializedViewQueryExecutor queryExecutor) { + _minionTaskZkMetadataManager = minionTaskZkMetadataManager; + _minionConf = minionConf; + _queryExecutor = queryExecutor; + } + + public void preProcess(PinotTaskConfig pinotTaskConfig) { + Map configs = pinotTaskConfig.getConfigs(); + String tableName = configs.get(MinionConstants.TABLE_NAME_KEY); + String taskMode = configs.getOrDefault(MaterializedViewTask.TASK_MODE_KEY, + MaterializedViewTask.TASK_MODE_APPEND); + long windowStartMs = Long.parseLong(configs.get(MaterializedViewTask.WINDOW_START_MS_KEY)); + + // Fetch MaterializedViewRuntimeMetadata for watermark validation and optimistic locking + HelixPropertyStore propertyStore = MINION_CONTEXT.getHelixPropertyStore(); + Stat stat = new Stat(); + MaterializedViewRuntimeMetadata runtime = MaterializedViewRuntimeMetadataUtils.fetchWithVersion( + propertyStore, tableName, stat); + + if (runtime != null) { + if (MaterializedViewTask.TASK_MODE_APPEND.equals(taskMode)) { + Preconditions.checkState(runtime.getWatermarkMs() <= windowStartMs, + "watermarkMs %s should not be larger than windowStartMs %s for table %s", + runtime.getWatermarkMs(), windowStartMs, tableName); + } else if (MaterializedViewTask.TASK_MODE_OVERWRITE.equals(taskMode)) { + PartitionInfo partitionInfo = 
runtime.getPartitions().get(windowStartMs); + Preconditions.checkState(partitionInfo != null && partitionInfo.getState() == PartitionState.STALE, + "Overwrite target partition %s should exist and be STALE for table %s", + windowStartMs, tableName); + } else if (MaterializedViewTask.TASK_MODE_DELETE.equals(taskMode)) { + // DELETE is now an executor-internal cleanup triggered when an OVERWRITE finds the + // source data has been retention-deleted (empty result + zero source segments). The + // partition must exist and be STALE; the executor will remove it from the map and + // drop the corresponding MV segments. + PartitionInfo partitionInfo = runtime.getPartitions().get(windowStartMs); + Preconditions.checkState(partitionInfo != null && partitionInfo.getState() == PartitionState.STALE, + "Delete target partition %s should exist and be STALE for table %s", + windowStartMs, tableName); + } + } else { + LOGGER.warn("MaterializedViewRuntimeMetadata for table: {} not found; will be initialized in postProcess", + tableName); + } + } + + @Override + public SegmentConversionResult executeTask(PinotTaskConfig pinotTaskConfig) + throws Exception { + preProcess(pinotTaskConfig); + + MinionEventObserver eventObserver = + MinionEventObservers.getInstance().getMinionEventObserver(pinotTaskConfig.getTaskId()); + + Map configs = pinotTaskConfig.getConfigs(); + String taskType = pinotTaskConfig.getTaskType(); + if (LOGGER.isInfoEnabled()) { + LOGGER.info("Starting task: {} with configs: {}", taskType, Obfuscator.DEFAULT.toJsonString(configs)); + } + + String tableName = configs.get(MinionConstants.TABLE_NAME_KEY); + long windowStartMs = Long.parseLong(configs.get(MaterializedViewTask.WINDOW_START_MS_KEY)); + long windowEndMs = Long.parseLong(configs.get(MaterializedViewTask.WINDOW_END_MS_KEY)); + String taskMode = configs.getOrDefault(MaterializedViewTask.TASK_MODE_KEY, + MaterializedViewTask.TASK_MODE_APPEND); + + // DELETE mode: skip query execution, only remove existing MV 
segments + if (MaterializedViewTask.TASK_MODE_DELETE.equals(taskMode)) { + return executeDeleteTask(pinotTaskConfig, eventObserver, tableName, windowStartMs, windowEndMs); + } + PartitionFingerprint taskFingerprint = getTaskFingerprint(configs, tableName, windowStartMs); + validateSourceFingerprintAtCommit(configs, tableName, windowStartMs, windowEndMs, taskFingerprint); + + String definedSQL = configs.get(MaterializedViewTask.DEFINED_SQL_KEY); + LOGGER.info("MaterializedViewTask for table: {}, window: [{}, {}), SQL: {}", + tableName, windowStartMs, windowEndMs, definedSQL); + + TableConfig tableConfig = getTableConfig(tableName); + Schema schema = getSchema(tableName); + + eventObserver.notifyProgress(pinotTaskConfig, "Executing query for MV table: " + tableName); + AuthProvider authProvider = resolveAuthProvider(configs); + String uploadURL = configs.get(MinionConstants.UPLOAD_URL_KEY); + Map authHeaders = AuthProviderUtils.makeAuthHeadersMap(authProvider); + + String maxRecordsStr = configs.get(MaterializedViewTask.MAX_NUM_RECORDS_PER_SEGMENT_KEY); + int maxNumRecordsPerSegment = maxRecordsStr != null + ? Integer.parseInt(maxRecordsStr) + : MaterializedViewTask.DEFAULT_MAX_NUM_RECORDS_PER_SEGMENT; + int effectiveLimit = MaterializedViewTaskUtils.parseEffectiveLimit(configs, tableName); + + // Generate a per-attempt UUID so segment names are unique across retries of the same window. + // Helix reuses the same subtask id (PinotTaskConfig#getTaskId) on every retry, so we cannot + // rely on taskId for uniqueness — a retry after a partial upload would reproduce identical + // names and the controller would reject the new lineage entry. + String attemptId = UUID.randomUUID().toString(); + + File tempDir = new File(FileUtils.getTempDirectory(), + "materialized_view_task_" + tableName + "_" + attemptId); + FileUtils.forceMkdir(tempDir); + + try { + // Phase 1: Stream the gRPC response one frame at a time and build segments chunk-by-chunk. 
+ // Heap residency is bounded by `maxNumRecordsPerSegment` rows (one in-flight segment's + // worth of buffered GenericRows) regardless of total window size. Saturation against + // `effectiveLimit` is checked once the stream is drained, so an over-limit window fails + // BEFORE committing any segment. + List conversionResults = new ArrayList<>(); + List tarFiles = new ArrayList<>(); + long totalRows = 0L; + int segIdx = 0; + + try (MaterializedViewQueryExecutor.QueryHandle queryHandle = + _queryExecutor.executeQuery(definedSQL, authHeaders)) { + DataSchema dataSchema = queryHandle.getDataSchema(); + FieldSpec[] fieldSpecs = resolveOutputFieldSpecs(dataSchema, schema); + Iterator rowIterator = queryHandle.rows(); + + List chunkBuffer = new ArrayList<>(Math.min(maxNumRecordsPerSegment, 4096)); + while (rowIterator.hasNext()) { + chunkBuffer.add(toGenericRow(dataSchema.getColumnNames(), fieldSpecs, rowIterator.next())); + totalRows++; + if (chunkBuffer.size() >= maxNumRecordsPerSegment) { + buildSegmentForChunk(tableName, windowStartMs, windowEndMs, attemptId, segIdx, + chunkBuffer, tableConfig, schema, tempDir, conversionResults, tarFiles, + eventObserver, pinotTaskConfig); + chunkBuffer = new ArrayList<>(maxNumRecordsPerSegment); + segIdx++; + } + } + if (!chunkBuffer.isEmpty()) { + buildSegmentForChunk(tableName, windowStartMs, windowEndMs, attemptId, segIdx, + chunkBuffer, tableConfig, schema, tempDir, conversionResults, tarFiles, + eventObserver, pinotTaskConfig); + segIdx++; + } + } + + // Completeness gate: the broker enforces LIMIT by truncating at exactly N rows, so a query + // that genuinely has more than N rows returns exactly N. Treat `totalRows == effectiveLimit` + // as a saturation: we cannot distinguish "exactly N" from "≥ N truncated". Fail BEFORE + // any segment is committed via lineage so the partition is not marked VALID against + // truncated data; the chunk-build path above only stages files to disk, no ZK / lineage + // mutation has happened yet. 
+ if (totalRows >= effectiveLimit) { + MaterializedViewTaskUtils.failOnSaturation(tableName, windowStartMs, windowEndMs, + totalRows, effectiveLimit); + } + + LOGGER.info("Query streamed {} rows for table: {} into {} segment(s)", + totalRows, tableName, conversionResults.size()); + + if (totalRows == 0L) { + LOGGER.info("No data returned for window [{}, {}) of table: {}.", windowStartMs, windowEndMs, tableName); + if (MaterializedViewTask.TASK_MODE_OVERWRITE.equals(taskMode)) { + validateSourceFingerprintAtCommit(configs, tableName, windowStartMs, windowEndMs, taskFingerprint); + replaceWindowSegments(tableName, windowStartMs, windowEndMs, Collections.emptyList(), + uploadURL, authProvider); + } + postProcess(pinotTaskConfig); + return new SegmentConversionResult.Builder() + .setTableNameWithType(tableName) + .build(); + } + + int numSegments = conversionResults.size(); + + // Phase 2: Segment lineage — find old segments and start replace + List segmentsTo = new ArrayList<>(); + for (SegmentConversionResult r : conversionResults) { + segmentsTo.add(r.getSegmentName()); + } + + validateSourceFingerprintAtCommit(configs, tableName, windowStartMs, windowEndMs, taskFingerprint); + String lineageEntryId = + startWindowSegmentReplace(tableName, windowStartMs, windowEndMs, segmentsTo, uploadURL, authProvider); + + try { + // Phase 3: Upload all segments + for (int i = 0; i < conversionResults.size(); i++) { + SegmentConversionResult result = conversionResults.get(i); + File tarFile = tarFiles.get(i); + String segmentName = result.getSegmentName(); + + eventObserver.notifyProgress(pinotTaskConfig, + String.format("Uploading segment %d/%d: %s", i + 1, numSegments, segmentName)); + + List
httpHeaders = getSegmentPushMetadataHeaders(pinotTaskConfig, authProvider, result); + List parameters = getSegmentPushCommonParams(tableName); + SegmentConversionUtils.uploadSegment(configs, httpHeaders, parameters, tableName, segmentName, + uploadURL, tarFile); + + reportSegmentUploadMetrics(result.getFile(), tableName, taskType); + + LOGGER.info("Successfully uploaded segment {}/{}: {} for table: {}", + i + 1, numSegments, segmentName, tableName); + } + + // Phase 4: End segment replace to atomically swap lineage + if (lineageEntryId != null) { + validateSourceFingerprintAtCommit(configs, tableName, windowStartMs, windowEndMs, taskFingerprint); + SegmentConversionUtils.endSegmentReplace( + tableName, uploadURL, lineageEntryId, + _minionConf.getEndReplaceSegmentsTimeoutMs(), authProvider); + LOGGER.info("Ended segment replace for table: {}, lineageEntryId: {}", tableName, lineageEntryId); + } + } catch (Exception e) { + // Best-effort revert of the IN_PROGRESS lineage entry so the next attempt is not blocked. + // If revert itself fails, the next startSegmentReplace will mark the previous entry as + // REVERTED and clean up leftover segments — same recovery contract as ConsistentDataPushUtils. 
+ if (lineageEntryId != null) { + revertWindowSegmentReplace(tableName, lineageEntryId, uploadURL, authProvider); + } + throw e; + } + + postProcess(pinotTaskConfig); + + return conversionResults.get(conversionResults.size() - 1); + } finally { + FileUtils.deleteQuietly(tempDir); + } + } + + private void replaceWindowSegments(String tableName, long windowStartMs, long windowEndMs, List segmentsTo, + String uploadURL, AuthProvider authProvider) + throws Exception { + String lineageEntryId = + startWindowSegmentReplace(tableName, windowStartMs, windowEndMs, segmentsTo, uploadURL, authProvider); + if (lineageEntryId != null) { + SegmentConversionUtils.endSegmentReplace( + tableName, uploadURL, lineageEntryId, _minionConf.getEndReplaceSegmentsTimeoutMs(), authProvider); + LOGGER.info("Ended segment replace for table: {}, lineageEntryId: {}", tableName, lineageEntryId); + } + } + + private String startWindowSegmentReplace(String tableName, long windowStartMs, long windowEndMs, + List segmentsTo, String uploadURL, AuthProvider authProvider) + throws Exception { + String segmentPrefix = tableName + "_" + windowStartMs + "_" + windowEndMs; + Set allExistingSegments = SegmentConversionUtils.getSegmentNamesForTable( + tableName, new URI(uploadURL).resolve("/"), authProvider); + List segmentsFrom = new ArrayList<>(); + for (String name : allExistingSegments) { + if (name.equals(segmentPrefix) || name.startsWith(segmentPrefix + "_")) { + segmentsFrom.add(name); + } + } + + if (segmentsFrom.isEmpty() && segmentsTo.isEmpty()) { + return null; + } + String lineageEntryId = SegmentConversionUtils.startSegmentReplace( + tableName, uploadURL, new StartReplaceSegmentsRequest(segmentsFrom, segmentsTo), authProvider); + LOGGER.info("Started segment replace for table: {}, lineageEntryId: {}, segmentsFrom: {}, segmentsTo: {}", + tableName, lineageEntryId, segmentsFrom, segmentsTo); + return lineageEntryId; + } + + /// Best-effort revert of a started segment replace lineage entry. 
+ /// + /// Used when the upload phase or `endSegmentReplace` throws after a successful + /// `startSegmentReplace`, to avoid an orphaned IN_PROGRESS lineage entry that would + /// block subsequent task retries. Always treats the table as OFFLINE — MV tables are + /// always offline by construction. + /// + /// Failures are swallowed (logged only): if the controller is unreachable now, the + /// next `startSegmentReplace` call will mark the previous entry as REVERTED and clean + /// up any leftover segments. + private void revertWindowSegmentReplace(String tableNameWithType, String lineageEntryId, String uploadURL, + AuthProvider authProvider) { + String rawTableName = TableNameBuilder.extractRawTableName(tableNameWithType); + try (FileUploadDownloadClient client = new FileUploadDownloadClient()) { + URI revertUri = FileUploadDownloadClient.getRevertReplaceSegmentsURI( + new URI(uploadURL), rawTableName, TableType.OFFLINE.name(), lineageEntryId, true); + client.revertReplaceSegments(revertUri, authProvider); + LOGGER.info("Reverted segment replace for table: {}, lineageEntryId: {}", tableNameWithType, lineageEntryId); + } catch (Exception revertException) { + LOGGER.error("Failed to revert segment replace for table: {}, lineageEntryId: {}. Next " + + "startSegmentReplace will clean up the orphaned entry.", + tableNameWithType, lineageEntryId, revertException); + } + } + + /// Handles DELETE mode: removes all existing MV segments for the given time window + /// via segment lineage replace (segmentsFrom=[old segments], segmentsTo=[]). + /// No query is executed and no new segments are created. 
+ private SegmentConversionResult executeDeleteTask(PinotTaskConfig pinotTaskConfig, + MinionEventObserver eventObserver, String tableName, long windowStartMs, long windowEndMs) + throws Exception { + Map configs = pinotTaskConfig.getConfigs(); + String uploadURL = configs.get(MinionConstants.UPLOAD_URL_KEY); + AuthProvider authProvider = resolveAuthProvider(configs); + + LOGGER.info("DELETE task for table: {}, window: [{}, {}). Removing MV segments.", + tableName, windowStartMs, windowEndMs); + eventObserver.notifyProgress(pinotTaskConfig, + "Deleting MV segments for window [" + windowStartMs + ", " + windowEndMs + ")"); + + String segmentPrefix = tableName + "_" + windowStartMs + "_" + windowEndMs; + Set allExistingSegments = SegmentConversionUtils.getSegmentNamesForTable( + tableName, new URI(uploadURL).resolve("/"), authProvider); + List segmentsFrom = new ArrayList<>(); + for (String name : allExistingSegments) { + if (name.equals(segmentPrefix) || name.startsWith(segmentPrefix + "_")) { + segmentsFrom.add(name); + } + } + + if (!segmentsFrom.isEmpty()) { + List segmentsTo = Collections.emptyList(); + String lineageEntryId = SegmentConversionUtils.startSegmentReplace( + tableName, uploadURL, + new StartReplaceSegmentsRequest(segmentsFrom, segmentsTo), + authProvider); + LOGGER.info("Started segment delete-replace for table: {}, lineageEntryId: {}, segmentsFrom: {}", + tableName, lineageEntryId, segmentsFrom); + + SegmentConversionUtils.endSegmentReplace( + tableName, uploadURL, lineageEntryId, + _minionConf.getEndReplaceSegmentsTimeoutMs(), authProvider); + LOGGER.info("Ended segment delete-replace for table: {}, lineageEntryId: {}", tableName, lineageEntryId); + } else { + LOGGER.info("No existing segments found for prefix: {} in table: {}. 
Nothing to delete.", + segmentPrefix, tableName); + } + + postProcess(pinotTaskConfig); + + return new SegmentConversionResult.Builder() + .setTableNameWithType(tableName) + .build(); + } + + public void postProcess(PinotTaskConfig pinotTaskConfig) { + Map configs = pinotTaskConfig.getConfigs(); + String tableName = configs.get(MinionConstants.TABLE_NAME_KEY); + String taskMode = configs.getOrDefault(MaterializedViewTask.TASK_MODE_KEY, + MaterializedViewTask.TASK_MODE_APPEND); + long windowStartMs = Long.parseLong(configs.get(MaterializedViewTask.WINDOW_START_MS_KEY)); + long windowEndMs = Long.parseLong(configs.get(MaterializedViewTask.WINDOW_END_MS_KEY)); + + updateMaterializedViewRuntime(configs, tableName, taskMode, windowStartMs, windowEndMs); + } + + /// Updates [MaterializedViewRuntimeMetadata] in a single CAS write, combining: + /// + /// - partitions: set VALID with new fingerprint (APPEND/OVERWRITE) or remove (DELETE) + /// - watermarkMs: advance on APPEND only (drives both scheduler dispatch and the + /// broker's SPLIT_REWRITE boundary) + /// + // Compile-time default for the CAS retry budget when racing to update MaterializedViewRuntimeMetadata. + // Up to maxTasksPerBatch executors can contend per batch completion; each retry re-fetches the + // latest version with jittered backoff (Thread.sleep below). 128 is well above any realistic + // maxTasksPerBatch and stays low enough that genuinely pathological contention still surfaces as + // a task failure (caught by Helix and retried at the task level). Overridable per cluster via + // `MaterializedViewTask.CLUSTER_CONFIG_KEY_MAX_RUNTIME_UPDATE_ATTEMPTS` (no minion restart). + private static final int DEFAULT_MAX_RUNTIME_UPDATE_ATTEMPTS = 128; + + /// Reads a single Helix CLUSTER-scope config value via the minion's `HelixManager`. Returns + /// `null` when the key is unset or the Helix manager has not yet been initialized. 
Used by + /// the executor (and other minion-side MV consumers) to pick up live cluster-config overrides + /// without a restart. + static String readMinionClusterConfig(String configName) { + try { + HelixManager helixManager = MinionContext.getInstance().getHelixManager(); + if (helixManager == null) { + return null; + } + HelixConfigScope scope = new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.CLUSTER) + .forCluster(helixManager.getClusterName()) + .build(); + Map values = + helixManager.getClusterManagmentTool().getConfig(scope, Collections.singletonList(configName)); + return values == null ? null : values.get(configName); + } catch (Exception e) { + LOGGER.debug("Failed to read minion cluster config '{}': {}", configName, e.getMessage()); + return null; + } + } + + private void updateMaterializedViewRuntime(Map configs, String tableName, + String taskMode, long windowStartMs, long windowEndMs) { + HelixPropertyStore propertyStore = MINION_CONTEXT.getHelixPropertyStore(); + int maxRuntimeUpdateAttempts = MaterializedViewTaskUtils.readPositiveIntClusterConfigOrDefault( + MaterializedViewTaskExecutor::readMinionClusterConfig, + MaterializedViewTask.CLUSTER_CONFIG_KEY_MAX_RUNTIME_UPDATE_ATTEMPTS, + DEFAULT_MAX_RUNTIME_UPDATE_ATTEMPTS); + + // Compute the new fingerprint and validate against the source ONCE, outside the CAS loop. + // Both operations are deterministic given the source-side ZK state: if validation fails on + // attempt 1 (real source drift), retrying cannot succeed and would mask the actionable error + // behind a generic "Failed after N attempts" message. The CAS retry only exists to absorb + // concurrent ConsistencyManager STALE markings on the runtime znode itself. 
+ PartitionFingerprint newFingerprint = null; + if (!MaterializedViewTask.TASK_MODE_DELETE.equals(taskMode)) { + newFingerprint = getTaskFingerprint(configs, tableName, windowStartMs); + validateSourceFingerprintAtCommit(configs, tableName, windowStartMs, windowEndMs, newFingerprint); + } + + ZkException lastCasException = null; + for (int attempt = 0; attempt < maxRuntimeUpdateAttempts; attempt++) { + if (attempt > 0) { + // Jittered backoff to avoid thundering herd against ZK when batched APPEND tasks + // race for the same MaterializedViewRuntimeMetadata znode (see maxRuntimeUpdateAttempts comment). + try { + Thread.sleep(50L + ThreadLocalRandom.current().nextInt(150)); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while retrying MV runtime update for table: " + tableName, ie); + } + LOGGER.warn("Retrying MV runtime update for table: {} (attempt {}/{})", tableName, attempt + 1, + maxRuntimeUpdateAttempts); + } + try { + // Re-fetch with version on every attempt to pick up concurrent ConsistencyManager + // updates (e.g. STALE markings) that may have arrived since the previous attempt. + Stat freshStat = new Stat(); + MaterializedViewRuntimeMetadata existing = + MaterializedViewRuntimeMetadataUtils.fetchWithVersion(propertyStore, tableName, freshStat); + int writeVersion = (existing != null) ? 
freshStat.getVersion() : -1; + + Map mergedInfos; + long existingWatermarkMs; + + if (existing != null) { + mergedInfos = new HashMap<>(existing.getPartitions()); + existingWatermarkMs = existing.getWatermarkMs(); + } else { + mergedInfos = new HashMap<>(); + existingWatermarkMs = 0L; + } + + long newWatermarkMs; + + if (MaterializedViewTask.TASK_MODE_DELETE.equals(taskMode)) { + mergedInfos.remove(windowStartMs); + newWatermarkMs = existingWatermarkMs; + LOGGER.info("DELETE mode: removed partition {} from MV runtime for table: {}", windowStartMs, tableName); + } else { + long nowMs = System.currentTimeMillis(); + PartitionInfo completedInfo = new PartitionInfo(PartitionState.VALID, newFingerprint, nowMs); + mergedInfos.put(windowStartMs, completedInfo); + LOGGER.info("Set partition {} to VALID (lastRefreshTime={}) for table: {}", windowStartMs, nowMs, tableName); + + if (MaterializedViewTask.TASK_MODE_APPEND.equals(taskMode)) { + // Advance to the highest contiguous VALID block starting from the existing watermark. + // Concurrent batch tasks may complete out of order; only advancing to windowEndMs would + // leave gaps when an earlier window hasn't finished yet. bucketMs is derived from the + // task's window length (one bucket per APPEND task by construction). 
+ long bucketMs = windowEndMs - windowStartMs; + Preconditions.checkState(bucketMs > 0, + "Invalid window: windowEndMs (%s) <= windowStartMs (%s) for table %s", + windowEndMs, windowStartMs, tableName); + newWatermarkMs = MaterializedViewTaskUtils.computeContiguousUpperMs(existingWatermarkMs, mergedInfos, + bucketMs); + LOGGER.info("APPEND mode: advancing watermarkMs from {} to {} for table: {}", + existingWatermarkMs, newWatermarkMs, tableName); + } else { + newWatermarkMs = existingWatermarkMs; + LOGGER.info("OVERWRITE mode: keeping watermarkMs at {} for table: {}", newWatermarkMs, tableName); + } + } + + MaterializedViewRuntimeMetadata updated = new MaterializedViewRuntimeMetadata( + tableName, newWatermarkMs, mergedInfos); + MaterializedViewRuntimeMetadataUtils.persist(propertyStore, updated, writeVersion); + + LOGGER.info("Updated MV runtime for table: {} (partitions={}, watermarkMs={})", + tableName, mergedInfos.size(), newWatermarkMs); + return; + } catch (ZkException e) { + // Only ZK CAS conflicts and transient ZK errors are retried. Non-ZK failures (e.g. + // IllegalStateException from invariant checks, NullPointerException) are programming + // bugs that retrying cannot resolve — let them propagate so the operator sees the real cause. 
+ lastCasException = e; + LOGGER.warn("ZK conflict while updating MV runtime for table: {} on attempt {}", tableName, attempt + 1, e); + } + } + throw new RuntimeException( + "Failed to update MV runtime for table: " + tableName + " after " + maxRuntimeUpdateAttempts + " attempts", + lastCasException); + } + + private PartitionFingerprint getTaskFingerprint(Map configs, String tableName, long windowStartMs) { + String fingerprintStr = configs.get(MaterializedViewTask.PARTITION_FINGERPRINTS_KEY); + Preconditions.checkState(fingerprintStr != null && !fingerprintStr.isEmpty(), + "Missing source partition fingerprint for MV task table %s windowStartMs %s", tableName, windowStartMs); + Map taskFingerprints = PartitionFingerprint.decodeMap(fingerprintStr); + PartitionFingerprint fingerprint = taskFingerprints.get(windowStartMs); + Preconditions.checkState(fingerprint != null, + "Missing source partition fingerprint for MV task table %s windowStartMs %s", tableName, windowStartMs); + return fingerprint; + } + + private void validateSourceFingerprintAtCommit(Map configs, String tableName, long windowStartMs, + long windowEndMs, PartitionFingerprint taskFingerprint) { + String sourceTableName = configs.get(MaterializedViewTask.SOURCE_TABLE_NAME_KEY); + Preconditions.checkState(sourceTableName != null && !sourceTableName.isEmpty(), + "Missing source table name for MV task table %s window [%s, %s)", tableName, windowStartMs, windowEndMs); + String sourceTableWithType = resolveSourceTableNameWithType(sourceTableName); + PartitionFingerprint currentFingerprint = computeWindowFingerprint( + ZKMetadataProvider.getSegmentsZKMetadata(MINION_CONTEXT.getHelixPropertyStore(), sourceTableWithType), + windowStartMs, windowEndMs); + Preconditions.checkState(taskFingerprint.equals(currentFingerprint), + "Source table %s changed while refreshing MV table %s window [%s, %s): taskFingerprint=%s, " + + "currentFingerprint=%s. 
Leaving MV partition stale for retry.", + sourceTableWithType, tableName, windowStartMs, windowEndMs, taskFingerprint, currentFingerprint); + } + + private String resolveSourceTableNameWithType(String sourceTableName) { + TableType tableType = TableNameBuilder.getTableTypeFromTableName(sourceTableName); + if (tableType != null) { + return sourceTableName; + } + String rawSourceTableName = TableNameBuilder.extractRawTableName(sourceTableName); + String offlineTableName = TableNameBuilder.OFFLINE.tableNameWithType(rawSourceTableName); + if (ZKMetadataProvider.getTableConfig(MINION_CONTEXT.getHelixPropertyStore(), offlineTableName) != null) { + return offlineTableName; + } + String realtimeTableName = TableNameBuilder.REALTIME.tableNameWithType(rawSourceTableName); + Preconditions.checkState( + ZKMetadataProvider.getTableConfig(MINION_CONTEXT.getHelixPropertyStore(), realtimeTableName) != null, + "Source table config not found for: %s", sourceTableName); + return realtimeTableName; + } + + /// Computes a [PartitionFingerprint] for the segments that overlap [windowStartMs, windowEndMs). + /// + /// The fingerprint is `Hashing.farmHashFingerprint64` over the sorted concatenation of + /// `\0\n` lines. Sorting by segment name makes the hash insensitive to + /// listing order. FarmHash64 is non-cryptographic but collision-resistant for non-adversarial + /// inputs; in particular it avoids the cancellation pathology of XOR-CRC (where swapping + /// one segment for another with the same XOR contribution produces an identical fingerprint). 
+ @VisibleForTesting + static PartitionFingerprint computeWindowFingerprint(List allSegments, + long windowStartMs, long windowEndMs) { + List overlapping = new ArrayList<>(); + for (SegmentZKMetadata seg : allSegments) { + long segStartMs = seg.getStartTimeMs(); + long segEndMs = seg.getEndTimeMs(); + if (segStartMs < windowEndMs && segEndMs >= windowStartMs) { + overlapping.add(seg); + } + } + overlapping.sort(Comparator.comparing(SegmentZKMetadata::getSegmentName)); + + Hasher hasher = Hashing.farmHashFingerprint64().newHasher(); + for (SegmentZKMetadata seg : overlapping) { + hasher.putString(seg.getSegmentName(), StandardCharsets.UTF_8); + hasher.putByte((byte) 0); + hasher.putLong(seg.getCrc()); + hasher.putByte((byte) '\n'); + } + return new PartitionFingerprint(overlapping.size(), hasher.hash().asLong()); + } + + /// Returns the highest contiguous VALID upper boundary starting from `fromMs`. + /// + /// When batch APPEND tasks run concurrently, windows may complete out of order. + /// Advancing `watermarkMs` only to the just-completed `windowEndMs` would + /// regress coverage if an earlier window hasn't finished yet. This method scans + /// `partitions` for an unbroken chain of VALID windows beginning at `fromMs` + /// and returns the end of the last VALID window in that chain. + /// + /// Bounded by `partitions.size()` iterations to defend against pathological maps. 
+ @VisibleForTesting + static long computeContiguousUpperMs(long fromMs, Map partitions, long bucketMs) { + return MaterializedViewTaskUtils.computeContiguousUpperMs(fromMs, partitions, bucketMs); + } + + @Override + protected SegmentZKMetadataCustomMapModifier getSegmentZKMetadataCustomMapModifier( + PinotTaskConfig pinotTaskConfig, SegmentConversionResult segmentConversionResult) { + return new SegmentZKMetadataCustomMapModifier( + SegmentZKMetadataCustomMapModifier.ModifyMode.UPDATE, Collections.emptyMap()); + } + + /// Fails the task if the query result set saturated the declared `LIMIT`, since that + /// strongly suggests the window was truncated and the resulting MV would be incomplete. + /// + /// Throwing here (before any segment build or `postProcess`) ensures: + /// + /// - the partition is NOT marked [PartitionState#VALID]; + /// - the runtime `watermarkMs` / partitions map are NOT advanced, so the broker will not + /// rewrite subsequent queries against the incomplete MV; + /// - Helix retries the task, letting transient causes self-heal. + /// + /// + /// If the config is missing (older tasks in flight during rolling upgrade) or non-positive, + /// the task fails loud — silently disabling the saturation gate is exactly the silent-truncation + /// regression this guard protects against. Helix retries with the same task config, so a + /// pre-upgrade task without `EFFECTIVE_LIMIT_KEY` will exhaust its retry budget and + /// surface as a failed task; the operator must regenerate the task once the controller is + /// upgraded. Documented upgrade order: upgrade controller before minion executors. 
+ @VisibleForTesting + static void verifyResultNotTruncated(Map configs, String tableName, + long windowStartMs, long windowEndMs, int actualRows) { + MaterializedViewTaskUtils.verifyResultNotTruncated(configs, tableName, windowStartMs, windowEndMs, actualRows); + } + + /// Converts raw query result rows into [GenericRow] objects using column names + /// from the [DataSchema]. + /// + /// Each column name returned by the query must exist in the MV [Schema]. A + /// mismatch indicates the `definedSQL` produced a column the MV table cannot store + /// (e.g. an alias was renamed, the schema is out of date, or the analyzer mapping is + /// stale); proceeding would silently drop the column from the persisted segment, so we + /// fail loud instead. The analyzer enforces this invariant at table-create time, so + /// hitting it at runtime points at a real correctness drift. + /// Resolves a per-column `FieldSpec[]` once for the streaming convert loop. Fails loud if the + /// gRPC response includes a column not declared in the MV schema — the analyzer enforces this + /// invariant at table-create time, so hitting it at runtime points at a real correctness drift. + private static FieldSpec[] resolveOutputFieldSpecs(DataSchema dataSchema, Schema schema) { + String[] columnNames = dataSchema.getColumnNames(); + FieldSpec[] fieldSpecs = new FieldSpec[columnNames.length]; + for (int i = 0; i < columnNames.length; i++) { + String columnName = columnNames[i]; + FieldSpec fieldSpec = schema.getFieldSpecFor(columnName); + Preconditions.checkState(fieldSpec != null, + "MV query returned column '%s' which is not declared in the MV schema. " + + "Update the MV schema to include this column or fix the definedSQL projection.", + columnName); + fieldSpecs[i] = fieldSpec; + } + return fieldSpecs; + } + + /// Converts a single gRPC-returned row into a `GenericRow` using the pre-resolved field-spec + /// array. 
Allocation cost: one `GenericRow` + one `HashMap` per row; pre-allocated structures + /// (`columnNames`, `fieldSpecs`) are reused across rows. + private static GenericRow toGenericRow(String[] columnNames, FieldSpec[] fieldSpecs, Object[] row) { + GenericRow genericRow = new GenericRow(); + for (int i = 0; i < columnNames.length; i++) { + String columnName = columnNames[i]; + Object value = row[i]; + FieldSpec fieldSpec = fieldSpecs[i]; + if (fieldSpec.getDataType().getStoredType() == FieldSpec.DataType.BYTES) { + value = decodeBytesValue(columnName, value); + } + genericRow.putValue(columnName, value); + } + return genericRow; + } + + /// Builds, tars, and registers a single segment from one chunk of buffered rows. Mutates + /// `conversionResults` and `tarFiles` in place so the upload phase can drive them as a + /// flat list — same shape the previous list-based implementation used. + private void buildSegmentForChunk(String tableName, long windowStartMs, long windowEndMs, + String attemptId, int segIdx, List chunk, TableConfig tableConfig, Schema schema, + File tempDir, List conversionResults, List tarFiles, + MinionEventObserver eventObserver, PinotTaskConfig pinotTaskConfig) + throws Exception { + String segmentName = MaterializedViewTaskUtils.buildSegmentName( + tableName, windowStartMs, windowEndMs, attemptId, segIdx); + + File segmentOutputDir = new File(tempDir, "segmentOutput_" + segIdx); + FileUtils.forceMkdir(segmentOutputDir); + + SegmentGeneratorConfig segmentGeneratorConfig = new SegmentGeneratorConfig(tableConfig, schema); + segmentGeneratorConfig.setTableName(tableName); + segmentGeneratorConfig.setOutDir(segmentOutputDir.getAbsolutePath()); + segmentGeneratorConfig.setSegmentName(segmentName); + + eventObserver.notifyProgress(pinotTaskConfig, + String.format("Building segment %d: %s (%d rows)", segIdx, segmentName, chunk.size())); + + SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl(); + 
driver.init(segmentGeneratorConfig, new GenericRowRecordReader(chunk)); + driver.build(); + + File segmentDir = new File(segmentOutputDir, segmentName); + Preconditions.checkState(segmentDir.exists(), "Segment generation failed for: %s", segmentName); + + File segmentTarFile = new File(tempDir, segmentName + TarCompressionUtils.TAR_GZ_FILE_EXTENSION); + TarCompressionUtils.createCompressedTarFile(segmentDir, segmentTarFile); + + conversionResults.add(new SegmentConversionResult.Builder() + .setFile(segmentDir) + .setSegmentName(segmentName) + .setTableNameWithType(tableName) + .build()); + tarFiles.add(segmentTarFile); + } + + @VisibleForTesting + static Object decodeBytesValue(String columnName, Object value) { + if (value == null || value instanceof byte[]) { + return value; + } + if (!(value instanceof String)) { + return value; + } + String stringValue = (String) value; + try { + return BytesUtils.toBytes(stringValue); + } catch (IllegalArgumentException hexException) { + try { + return Base64.getDecoder().decode(stringValue); + } catch (IllegalArgumentException base64Exception) { + base64Exception.addSuppressed(hexException); + throw new IllegalArgumentException( + "Cannot decode BYTES value for column: " + columnName + " as hex or base64", base64Exception); + } + } + } + + /// Builds a segment name that is stable within a single attempt but unique across retries of the + /// same window. The `attemptId` must be a per-invocation value (e.g., a fresh UUID) and + /// must NOT be the Helix subtask id, which is reused across retries. Using the Helix subtask id + /// would reproduce identical names on retry, causing the controller to reject the new lineage + /// entry when segments from a previous partial attempt already exist. 
+ @VisibleForTesting + static String buildSegmentName(String tableName, long windowStartMs, long windowEndMs, + String attemptId, int segIdx) { + return MaterializedViewTaskUtils.buildSegmentName(tableName, windowStartMs, windowEndMs, attemptId, segIdx); + } +} diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorFactory.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorFactory.java new file mode 100644 index 000000000000..fb54bb30c7fd --- /dev/null +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorFactory.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.plugin.minion.tasks.materializedview; + +import org.apache.pinot.common.config.GrpcConfig; +import org.apache.pinot.materializedview.executor.GrpcMaterializedViewQueryExecutor; +import org.apache.pinot.materializedview.executor.MaterializedViewQueryExecutor; +import org.apache.pinot.minion.MinionConf; +import org.apache.pinot.minion.MinionContext; +import org.apache.pinot.minion.executor.MinionTaskZkMetadataManager; +import org.apache.pinot.minion.executor.PinotTaskExecutor; +import org.apache.pinot.minion.executor.PinotTaskExecutorFactory; +import org.apache.pinot.spi.annotations.minion.TaskExecutorFactory; +import org.apache.pinot.spi.env.PinotConfiguration; +import org.apache.pinot.spi.utils.CommonConstants; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; + + +/// Factory for creating [MaterializedViewTaskExecutor] instances. +/// +/// Creates a long-lived [GrpcMaterializedViewQueryExecutor] lazily on the first +/// [#create()] call (not during [#init]) because the +/// [MinionContext#getHelixManager()] is only available after the minion +/// has connected to Helix, which happens after factory initialization. +/// The executor is then shared across all executor instances for connection +/// reuse and load balancing. 
+@TaskExecutorFactory +public class MaterializedViewTaskExecutorFactory implements PinotTaskExecutorFactory { + private MinionTaskZkMetadataManager _zkMetadataManager; + private MinionConf _minionConf; + private volatile MaterializedViewQueryExecutor _queryExecutor; + + @Override + public void init(MinionTaskZkMetadataManager zkMetadataManager) { + _zkMetadataManager = zkMetadataManager; + } + + @Override + public void init(MinionTaskZkMetadataManager zkMetadataManager, MinionConf minionConf) { + _zkMetadataManager = zkMetadataManager; + _minionConf = minionConf; + } + + @Override + public String getTaskType() { + return CommonConstants.MaterializedViewTask.TASK_TYPE; + } + + @Override + public PinotTaskExecutor create() { + if (_queryExecutor == null) { + synchronized (this) { + if (_queryExecutor == null) { + // Build the gRPC client config from the minion's own configuration, scoped to the + // MaterializedViewTask.MINION_BROKER_GRPC_CONFIG_PREFIX prefix. This is how operators + // enable TLS, raise the max inbound message size for large MV result sets, and tune + // keepalive. Falling back to an empty configuration (no TLS, defaults) when the minion + // was initialized via the legacy single-arg init(zkMetadataManager) overload — fine for + // local tests but production deployments should use the two-arg init. + PinotConfiguration grpcClientConfig = _minionConf != null + ? 
_minionConf.subset(MaterializedViewTask.MINION_BROKER_GRPC_CONFIG_PREFIX) + : new PinotConfiguration(); + _queryExecutor = new GrpcMaterializedViewQueryExecutor( + MinionContext.getInstance().getHelixManager(), + new GrpcConfig(grpcClientConfig)); + } + } + } + return new MaterializedViewTaskExecutor(_zkMetadataManager, _minionConf, _queryExecutor); + } +} diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskGenerator.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskGenerator.java new file mode 100644 index 000000000000..e515b97f416a --- /dev/null +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskGenerator.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.plugin.minion.tasks.materializedview; + +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import org.apache.helix.store.HelixPropertyStore; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; +import org.apache.pinot.controller.helix.core.minion.ClusterInfoAccessor; +import org.apache.pinot.controller.helix.core.minion.generator.BaseTaskGenerator; +import org.apache.pinot.controller.helix.core.minion.generator.TaskGeneratorUtils; +import org.apache.pinot.core.minion.PinotTaskConfig; +import org.apache.pinot.materializedview.context.MaterializedViewTaskGeneratorContext; +import org.apache.pinot.materializedview.scheduler.MaterializedViewTaskScheduler; +import org.apache.pinot.spi.annotations.minion.TaskGenerator; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.CommonConstants; + + +/// Pinot minion plugin wiring for the materialized-view task scheduler. 
+@TaskGenerator +public class MaterializedViewTaskGenerator extends BaseTaskGenerator { + + @Override + public String getTaskType() { + return CommonConstants.MaterializedViewTask.TASK_TYPE; + } + + @Override + public List generateTasks(List tableConfigs) { + return scheduler().generateTasks(tableConfigs); + } + + @Override + public void validateTaskConfigs(TableConfig tableConfig, Schema schema, Map taskConfigs) { + scheduler().validateTaskConfigs(tableConfig, schema, taskConfigs); + } + + private MaterializedViewTaskScheduler scheduler() { + return new MaterializedViewTaskScheduler(new ControllerTaskGeneratorContext(_clusterInfoAccessor)); + } + + private static final class ControllerTaskGeneratorContext implements MaterializedViewTaskGeneratorContext { + private final ClusterInfoAccessor _clusterInfoAccessor; + + private ControllerTaskGeneratorContext(ClusterInfoAccessor clusterInfoAccessor) { + _clusterInfoAccessor = clusterInfoAccessor; + } + + @Override + public HelixPropertyStore getPropertyStore() { + return _clusterInfoAccessor.getPinotHelixResourceManager().getPropertyStore(); + } + + @Override + public List getSegmentsZKMetadata(String tableNameWithType) { + return _clusterInfoAccessor.getSegmentsZKMetadata(tableNameWithType); + } + + @Override + public String getVipUrl() { + return _clusterInfoAccessor.getVipUrl(); + } + + @Override + public void forRunningTasks(String tableNameWithType, String taskType, + Consumer> taskConfigConsumer) { + TaskGeneratorUtils.forRunningTasks(tableNameWithType, taskType, _clusterInfoAccessor, taskConfigConsumer); + } + + @Override + public boolean tableExists(String tableNameWithType) { + return _clusterInfoAccessor.getTableConfig(tableNameWithType) != null; + } + + @Override + public TableConfig getTableConfig(String tableNameWithType) { + TableConfig tableConfig = _clusterInfoAccessor.getTableConfig(tableNameWithType); + if (tableConfig == null) { + throw new IllegalStateException("Table config not found for: " + 
tableNameWithType + + " (use tableExists() to probe; this method requires the table to exist)"); + } + return tableConfig; + } + + @Override + public Schema getTableSchema(String tableName) { + Schema schema = _clusterInfoAccessor.getTableSchema(tableName); + if (schema == null) { + throw new IllegalStateException("Schema not found for table: " + tableName + + " (the table may exist without a registered schema — fix the cluster state)"); + } + return schema; + } + + @Override + public String getClusterConfig(String configName) { + return _clusterInfoAccessor.getClusterConfig(configName); + } + } +} diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskProgressObserverFactory.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskProgressObserverFactory.java new file mode 100644 index 000000000000..31d636762fab --- /dev/null +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskProgressObserverFactory.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.plugin.minion.tasks.materializedview; + +import org.apache.pinot.minion.event.BaseMinionProgressObserverFactory; +import org.apache.pinot.spi.annotations.minion.EventObserverFactory; +import org.apache.pinot.spi.utils.CommonConstants; + + +/// Progress observer factory for [MaterializedViewTaskExecutor]. +@EventObserverFactory +public class MaterializedViewTaskProgressObserverFactory extends BaseMinionProgressObserverFactory { + + @Override + public String getTaskType() { + return CommonConstants.MaterializedViewTask.TASK_TYPE; + } +} diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/TaskRegistryTest.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/TaskRegistryTest.java index 6abbf1eea61f..ba19a25fce75 100644 --- a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/TaskRegistryTest.java +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/TaskRegistryTest.java @@ -21,6 +21,8 @@ import java.util.Set; import org.apache.pinot.controller.helix.core.minion.generator.TaskGeneratorRegistry; import org.apache.pinot.minion.executor.TaskExecutorFactoryRegistry; +import org.apache.pinot.plugin.minion.tasks.materializedview.MaterializedViewTaskExecutorFactory; +import org.apache.pinot.plugin.minion.tasks.materializedview.MaterializedViewTaskGenerator; import org.apache.pinot.plugin.minion.tasks.mergerollup.MergeRollupTaskExecutorFactory; import org.apache.pinot.plugin.minion.tasks.mergerollup.MergeRollupTaskGenerator; import org.apache.pinot.plugin.minion.tasks.purge.PurgeTaskExecutorFactory; @@ -43,6 +45,7 @@ public void testTaskGeneratorRegistry() { 
assertTrue(classes.contains(PurgeTaskGenerator.class)); assertTrue(classes.contains(SegmentGenerationAndPushTaskGenerator.class)); assertTrue(classes.contains(RealtimeToOfflineSegmentsTaskGenerator.class)); + assertTrue(classes.contains(MaterializedViewTaskGenerator.class)); } @Test @@ -52,5 +55,6 @@ public void testTaskExecutorRegistry() { assertTrue(classes.contains(PurgeTaskExecutorFactory.class)); assertTrue(classes.contains(SegmentGenerationAndPushTaskExecutorFactory.class)); assertTrue(classes.contains(RealtimeToOfflineSegmentsTaskExecutorFactory.class)); + assertTrue(classes.contains(MaterializedViewTaskExecutorFactory.class)); } } diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewSegmentNameTest.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewSegmentNameTest.java new file mode 100644 index 000000000000..06a7b7122d6a --- /dev/null +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewSegmentNameTest.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.plugin.minion.tasks.materializedview; + +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertTrue; + + +/// Verifies that MV segment names are unique across task retries for the same window. +/// +/// Bug: before the fix, names were `table_startMs_endMs_segIdx`. Two attempts of the +/// same window produced identical names. When attempt-1 uploaded segments but did not finish +/// `startReplaceSegments`, attempt-2 would regenerate the same names and the controller +/// would reject the new lineage entry because the segment names already existed. +/// +/// Fix: names now include a per-invocation `attemptId` (a fresh UUID generated at the +/// start of each `executeTask` call): `table_startMs_endMs_attemptId_segIdx`. +/// The Helix subtask id is NOT used here because Helix reuses the same subtask id across retries +/// of a job, which would reproduce identical names and trigger the same collision. 
+public class MaterializedViewSegmentNameTest { + + private static final String TABLE = "orders_mv_OFFLINE"; + private static final long START_MS = 1_700_000_000_000L; + private static final long END_MS = 1_700_086_400_000L; + + @Test + public void testSegmentNameIncludesAttemptId() { + String attemptId = "550e8400-e29b-41d4-a716-446655440000"; + String name = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, attemptId, 0); + assertTrue(name.contains(attemptId), + "Segment name must contain attemptId to be unique across retries: " + name); + } + + @Test + public void testDifferentAttemptIdsProduceDifferentNames() { + String name1 = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, "uuid-attempt-1", 0); + String name2 = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, "uuid-attempt-2", 0); + assertNotEquals(name1, name2, + "Same window, same segIdx but different attemptIds must produce different names"); + } + + @Test + public void testSameAttemptIdSameWindowProducesSameName() { + String attemptId = "550e8400-e29b-41d4-a716-446655440000"; + String name1 = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, attemptId, 0); + String name2 = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, attemptId, 0); + assertEquals(name1, name2, + "Same attemptId and same segIdx must produce stable names within one attempt"); + } + + @Test + public void testDifferentSegIdxProduceDifferentNames() { + String attemptId = "550e8400-e29b-41d4-a716-446655440000"; + String name0 = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, attemptId, 0); + String name1 = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, attemptId, 1); + assertNotEquals(name0, name1, + "Different segIdx values must produce different names"); + } + + @Test + public void testNameStartsWithWindowPrefix() { + String attemptId = "550e8400-e29b-41d4-a716-446655440000"; + String 
name = MaterializedViewTaskExecutor.buildSegmentName(TABLE, START_MS, END_MS, attemptId, 0); + String expectedPrefix = TABLE + "_" + START_MS + "_" + END_MS; + assertTrue(name.startsWith(expectedPrefix), + "Segment name must start with table_startMs_endMs so the window prefix scan matches it: " + name); + } +} diff --git a/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorTest.java b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorTest.java new file mode 100644 index 000000000000..01336b89f4a1 --- /dev/null +++ b/pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/materializedview/MaterializedViewTaskExecutorTest.java @@ -0,0 +1,283 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.plugin.minion.tasks.materializedview; + +import java.util.Base64; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.pinot.common.metadata.segment.SegmentZKMetadata; +import org.apache.pinot.materializedview.metadata.PartitionFingerprint; +import org.apache.pinot.materializedview.metadata.PartitionInfo; +import org.apache.pinot.materializedview.metadata.PartitionState; +import org.apache.pinot.spi.utils.CommonConstants.MaterializedViewTask; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + + +/// Unit tests for [MaterializedViewTaskExecutor] helpers. +/// +/// The LIMIT-saturation check is the single-most-important correctness gate in the MV executor: +/// if the query result saturates the declared LIMIT, we MUST fail the task so the runtime +/// watermark / partitions map is not advanced against incomplete data.
+public class MaterializedViewTaskExecutorTest { + + private static final String TABLE = "mv_orders"; + private static final long WINDOW_START = 1_700_000_000_000L; + private static final long WINDOW_END = 1_700_086_400_000L; + + @Test + public void testUnderLimitPasses() { + Map configs = configsWithLimit(1_000); + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 999); + } + + @Test + public void testAtLimitFails() { + Map configs = configsWithLimit(1_000); + try { + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 1_000); + fail("Expected completeness gate to fail when rows == LIMIT"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("saturated LIMIT"), + "Unexpected message: " + e.getMessage()); + assertTrue(e.getMessage().contains(TABLE)); + } + } + + @Test + public void testOverLimitFails() { + // Defensive: the broker should cap at LIMIT, but an ill-behaved executor could return more. + Map configs = configsWithLimit(1_000); + try { + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 1_500); + fail("Expected completeness gate to fail when rows > LIMIT"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("saturated LIMIT"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testMissingLimitKeyFailsLoudly() { + // Pre-upgrade tasks lacking EFFECTIVE_LIMIT_KEY must NOT be silently passed through — + // a missing key would let the broker's small default-LIMIT truncate the MV window + // without the saturation gate detecting it. Helix will retry; the retry sees a new + // task config (post-upgrade) and succeeds. 
+ Map configs = new HashMap<>(); + try { + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 42); + fail("Expected IllegalStateException for missing EFFECTIVE_LIMIT_KEY"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("Missing"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testInvalidLimitKeyThrows() { + Map configs = new HashMap<>(); + configs.put(MaterializedViewTask.EFFECTIVE_LIMIT_KEY, "not-a-number"); + try { + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 42); + fail("Expected IllegalStateException for malformed effectiveLimit"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("Invalid"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testZeroLimitFailsLoudly() { + // The generator always emits a positive effectiveLimit (user-declared or DEFAULT_MV_QUERY_LIMIT). + // A 0 here means corrupted task config — fail loud so the bug surfaces. + Map configs = configsWithLimit(0); + try { + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 0); + fail("Expected IllegalStateException for non-positive effectiveLimit"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("must be positive"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testNegativeLimitFailsLoudly() { + Map configs = configsWithLimit(-1); + try { + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 0); + fail("Expected IllegalStateException for negative effectiveLimit"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("must be positive"), + "Unexpected message: " + e.getMessage()); + } + } + + @Test + public void testZeroRowsPasses() { + // Empty windows are legitimate and must not be flagged as truncated. 
+ Map configs = configsWithLimit(1_000); + MaterializedViewTaskExecutor.verifyResultNotTruncated( + configs, TABLE, WINDOW_START, WINDOW_END, 0); + } + + private static Map configsWithLimit(int limit) { + Map configs = new HashMap<>(); + configs.put(MaterializedViewTask.EFFECTIVE_LIMIT_KEY, String.valueOf(limit)); + return configs; + } + + // ----------------------------------------------------------------------- + // decodeBytesValue + // ----------------------------------------------------------------------- + + @Test + public void testDecodeBytesValueFromHex() { + assertEquals((byte[]) MaterializedViewTaskExecutor.decodeBytesValue("raw_hll", "01020a0f"), + new byte[]{1, 2, 10, 15}); + } + + @Test + public void testDecodeBytesValueFromBase64() { + String value = Base64.getEncoder().encodeToString(new byte[]{1, 2, 10, 15}); + assertEquals((byte[]) MaterializedViewTaskExecutor.decodeBytesValue("raw_theta", value), + new byte[]{1, 2, 10, 15}); + } + + // ----------------------------------------------------------------------- + // computeContiguousUpperMs + // ----------------------------------------------------------------------- + + private static final long BUCKET_MS = 86_400_000L; // 1d + + @Test + public void testContiguousEmptyMap() { + long result = MaterializedViewTaskExecutor.computeContiguousUpperMs( + WINDOW_START, new LinkedHashMap<>(), BUCKET_MS); + assertEquals(result, WINDOW_START, "Empty map: cursor unchanged"); + } + + @Test + public void testContiguousSingleValid() { + Map partitions = new LinkedHashMap<>(); + partitions.put(WINDOW_START, validInfo()); + long result = MaterializedViewTaskExecutor.computeContiguousUpperMs( + WINDOW_START, partitions, BUCKET_MS); + assertEquals(result, WINDOW_START + BUCKET_MS, "Single VALID: advances by one bucket"); + } + + @Test + public void testContiguousChainOfThree() { + Map partitions = new LinkedHashMap<>(); + partitions.put(WINDOW_START, validInfo()); + partitions.put(WINDOW_START + BUCKET_MS, validInfo()); 
+ partitions.put(WINDOW_START + 2 * BUCKET_MS, validInfo()); + long result = MaterializedViewTaskExecutor.computeContiguousUpperMs( + WINDOW_START, partitions, BUCKET_MS); + assertEquals(result, WINDOW_START + 3 * BUCKET_MS, "Three VALIDs: advances three buckets"); + } + + @Test + public void testContiguousStopsAtGap() { + // [START] VALID, [START+1d] missing, [START+2d] VALID — chain stops at the gap. + Map partitions = new LinkedHashMap<>(); + partitions.put(WINDOW_START, validInfo()); + partitions.put(WINDOW_START + 2 * BUCKET_MS, validInfo()); + long result = MaterializedViewTaskExecutor.computeContiguousUpperMs( + WINDOW_START, partitions, BUCKET_MS); + assertEquals(result, WINDOW_START + BUCKET_MS, "Gap after first VALID: stops there"); + } + + @Test + public void testContiguousStopsAtNonValidState() { + Map partitions = new LinkedHashMap<>(); + partitions.put(WINDOW_START, validInfo()); + partitions.put(WINDOW_START + BUCKET_MS, staleInfo()); + long result = MaterializedViewTaskExecutor.computeContiguousUpperMs( + WINDOW_START, partitions, BUCKET_MS); + assertEquals(result, WINDOW_START + BUCKET_MS, "STALE breaks the chain like a gap"); + } + + @Test + public void testContiguousFromMsAlreadyMissing() { + // Cursor starts at a missing key — return immediately without advancing. 
+ Map partitions = new LinkedHashMap<>(); + partitions.put(WINDOW_START + BUCKET_MS, validInfo()); + long result = MaterializedViewTaskExecutor.computeContiguousUpperMs( + WINDOW_START, partitions, BUCKET_MS); + assertEquals(result, WINDOW_START, "fromMs not present: no advance"); + } + + @Test + public void testWindowFingerprintIncludesSegmentIdentity() { + SegmentZKMetadata first = segment("segA", WINDOW_START, WINDOW_START + BUCKET_MS, 1234L); + SegmentZKMetadata second = segment("segB", WINDOW_START, WINDOW_START + BUCKET_MS, 1234L); + + PartitionFingerprint firstFingerprint = + MaterializedViewTaskExecutor.computeWindowFingerprint(List.of(first), WINDOW_START, WINDOW_START + BUCKET_MS); + PartitionFingerprint secondFingerprint = + MaterializedViewTaskExecutor.computeWindowFingerprint(List.of(second), WINDOW_START, WINDOW_START + BUCKET_MS); + + assertTrue(!firstFingerprint.equals(secondFingerprint), + "Fingerprint must change when segment identity changes even if CRC is reused"); + } + + @Test + public void testWindowFingerprintOnlyCountsOverlappingSegments() { + SegmentZKMetadata overlapping = segment("overlap", WINDOW_START, WINDOW_START + BUCKET_MS, 10L); + SegmentZKMetadata outside = segment("outside", WINDOW_START + 2 * BUCKET_MS, WINDOW_START + 3 * BUCKET_MS, 20L); + + PartitionFingerprint fingerprint = MaterializedViewTaskExecutor.computeWindowFingerprint( + List.of(overlapping, outside), WINDOW_START, WINDOW_START + BUCKET_MS); + + assertEquals(fingerprint.getSegmentCount(), 1); + assertEquals(fingerprint, + MaterializedViewTaskExecutor.computeWindowFingerprint(List.of(overlapping), WINDOW_START, + WINDOW_START + BUCKET_MS)); + } + + private static PartitionInfo validInfo() { + return new PartitionInfo(PartitionState.VALID, new PartitionFingerprint(0, 0), 0L); + } + + private static PartitionInfo staleInfo() { + return new PartitionInfo(PartitionState.STALE, new PartitionFingerprint(0, 0), 0L); + } + + private static SegmentZKMetadata segment(String 
name, long startMs, long endMs, long crc) { + SegmentZKMetadata segmentZKMetadata = new SegmentZKMetadata(name); + segmentZKMetadata.setTimeUnit(TimeUnit.MILLISECONDS); + segmentZKMetadata.setStartTime(startMs); + segmentZKMetadata.setEndTime(endMs); + segmentZKMetadata.setCrc(crc); + return segmentZKMetadata; + } +} diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java index fad1c44f4af4..ac64394aad02 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java @@ -1706,6 +1706,141 @@ public static class Minion { public static final String DEFAULT_ALLOW_DOWNLOAD_FROM_SERVER = "false"; } + /** + * Materializes pre-aggregated data into an OFFLINE table based on a user-defined SQL query. + * The generator computes a time window and appends it to the SQL; the executor queries the + * base table via the broker, builds segments from the results, and uploads them to the MV + * table. + * + *

Supports three task modes: {@code APPEND} (new time windows), {@code OVERWRITE} + * (re-materialize stale partitions), and {@code DELETE} (remove expired partitions). + * + *

User-facing config keys: {@code definedSQL}, {@code bucketTimePeriod}, + * {@code bufferTimePeriod} (optional), {@code maxNumRecordsPerSegment} (optional, default + * {@link #DEFAULT_MAX_NUM_RECORDS_PER_SEGMENT}). + */ + public static class MaterializedViewTask { + public static final String TASK_TYPE = "MaterializedViewTask"; + + /** + * Prefix for the gRPC client config the minion uses to query the broker when materializing + * the MV. Set keys under this prefix (e.g. {@code pinot.minion.materializedview.broker.grpc. + * usePlainText=false}, {@code .tls.keystore.path=...}) to enable TLS, raise the max inbound + * message size, or tune keepalive. Without these, the gRPC client connects in plaintext with + * defaults — fine for local quickstarts but wrong for any TLS-enabled production cluster. + * + *

Note: per-request auth metadata (Bearer tokens, etc.) is unaffected by this prefix; it + * is sourced per task from the task's {@code AuthProvider} and forwarded as gRPC metadata. + */ + public static final String MINION_BROKER_GRPC_CONFIG_PREFIX = "pinot.minion.materializedview.broker.grpc"; + + public static final String DEFINED_SQL_KEY = "definedSQL"; + public static final String BUCKET_TIME_PERIOD_KEY = "bucketTimePeriod"; + public static final String BUFFER_TIME_PERIOD_KEY = "bufferTimePeriod"; + public static final String MAX_NUM_RECORDS_PER_SEGMENT_KEY = "maxNumRecordsPerSegment"; + + public static final String WINDOW_START_MS_KEY = "windowStartMs"; + public static final String WINDOW_END_MS_KEY = "windowEndMs"; + public static final String SOURCE_TABLE_NAME_KEY = "sourceTableName"; + public static final String PARTITION_FINGERPRINTS_KEY = "partitionFingerprints"; + + /** + * Generator-populated {@code LIMIT} in effect for the MV query: the value declared in + * {@code definedSQL}, or the auto-injected default when none is declared. The executor uses it + * to detect result-set truncation (when the query returns {@code LIMIT} or more rows, the window + * is almost certainly incomplete and must not be marked VALID / advance the runtime watermark). + */ + public static final String EFFECTIVE_LIMIT_KEY = "effectiveLimit"; + + public static final String TASK_MODE_KEY = "taskMode"; + public static final String TASK_MODE_APPEND = "APPEND"; + public static final String TASK_MODE_OVERWRITE = "OVERWRITE"; + public static final String TASK_MODE_DELETE = "DELETE"; + + public static final int DEFAULT_MAX_NUM_RECORDS_PER_SEGMENT = 5_000_000; + + /** + * Maximum number of APPEND task windows to schedule in a single generator cycle. + * Increase this to back-fill historical data faster. Default 4 lets a typical onboarding + * back-fill complete in roughly {@code N/4} scheduling cycles instead of {@code N} for a + * single-task-per-cycle setup, while keeping minion-pool contention bounded.
+ */ + public static final String MAX_TASKS_PER_BATCH_KEY = "maxTasksPerBatch"; + public static final int DEFAULT_MAX_TASKS_PER_BATCH = 4; + + /** + * Per-MV staleness SLO. Broker excludes the MV from rewrite when + * {@code (now - watermarkMs) > stalenessThresholdMs}, falling back to the base table. + * Operators set this to bound the maximum age of MV-served data. Default {@code 0} means + * "no SLO check" (broker uses any MV with a non-zero watermark). + */ + public static final String STALENESS_THRESHOLD_MS_KEY = "stalenessThresholdMs"; + public static final long DEFAULT_STALENESS_THRESHOLD_MS = 0L; + + /** + * Hard upper bound on the user-facing {@code maxTasksPerBatch} config - values above this + * are rejected at table-create time. Distinct from the internal scheduler-loop iteration + * cap (which can be larger because it covers historical-VALID skip work, not just slot + * count). + */ + public static final int MAX_TASKS_PER_BATCH_USER_CAP = 1_000; + + /** + * Auto-injected {@code LIMIT} value used when {@code definedSQL} omits an explicit LIMIT. + * + *

Without this, the broker would silently apply its cluster-wide default query limit + * (see {@code pinot.broker.default.query.limit}, default 10) to MV-generation queries and + * truncate every window to that many rows - the executor's saturation gate cannot detect + * such truncation because it never sees the broker's silent override. + */ + public static final int DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT = 1_000_000; + + /** + * Hard upper bound on any user-declared LIMIT in {@code definedSQL}. Capped at + * {@code 100_000_000} so a single window cannot OOM the executor - the executor must + * accumulate all returned rows in memory before the saturation gate can detect truncation. + * Operators with legitimately larger windows must split via narrower {@code bucketTimePeriod} + * or filters in {@code definedSQL}. + */ + public static final int MAX_MATERIALIZED_VIEW_QUERY_LIMIT = 100_000_000; + + // ------------------------------------------------------------------------- + // Cluster-config keys that override the compile-time defaults above. + // + // All keys are read live from Helix CLUSTER scope on each consumer-site + // call — no controller / minion restart is required for a value change + // to take effect. When a key is unset, malformed, or non-positive, the + // compile-time default applies. + // + // Use `pinot-admin.sh ClusterConfig` or the controller REST endpoint + // /cluster/configs to set / update / unset these. + // ------------------------------------------------------------------------- + + /// Cluster-config key. Overrides {@link #DEFAULT_MATERIALIZED_VIEW_QUERY_LIMIT}. + public static final String CLUSTER_CONFIG_KEY_DEFAULT_QUERY_LIMIT = + "pinot.materialized.view.query.default.limit"; + + /// Cluster-config key. Overrides {@link #MAX_MATERIALIZED_VIEW_QUERY_LIMIT}. + public static final String CLUSTER_CONFIG_KEY_MAX_QUERY_LIMIT = + "pinot.materialized.view.query.max.limit"; + + /// Cluster-config key. 
Overrides {@link #MAX_TASKS_PER_BATCH_USER_CAP}. + public static final String CLUSTER_CONFIG_KEY_MAX_TASKS_PER_BATCH_CAP = + "pinot.materialized.view.scheduler.max.tasks.per.batch.cap"; + + /// Cluster-config key. Overrides the scheduler's internal batch-loop iteration cap. + public static final String CLUSTER_CONFIG_KEY_MAX_BATCH_LOOP_ITERATIONS = + "pinot.materialized.view.scheduler.max.batch.loop.iterations"; + + /// Cluster-config key. Overrides the executor's runtime-znode CAS retry budget. + public static final String CLUSTER_CONFIG_KEY_MAX_RUNTIME_UPDATE_ATTEMPTS = + "pinot.materialized.view.executor.runtime.update.max.attempts"; + + /// Cluster-config key. Overrides the consistency manager's debounce window (ms). + public static final String CLUSTER_CONFIG_KEY_CONSISTENCY_DEBOUNCE_MS = + "pinot.materialized.view.consistency.debounce.ms"; + } + public static class ControllerJob { /** * Controller job ZK props diff --git a/pinot-tools/src/main/java/org/apache/pinot/tools/MaterializedViewQuickStart.java b/pinot-tools/src/main/java/org/apache/pinot/tools/MaterializedViewQuickStart.java new file mode 100644 index 000000000000..c367bd26edd5 --- /dev/null +++ b/pinot-tools/src/main/java/org/apache/pinot/tools/MaterializedViewQuickStart.java @@ -0,0 +1,389 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.tools; + +import com.fasterxml.jackson.databind.JsonNode; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.pinot.common.minion.MinionClient; +import org.apache.pinot.spi.utils.CommonConstants; +import org.apache.pinot.tools.admin.PinotAdministrator; +import org.apache.pinot.tools.admin.command.QuickstartRunner; + + +/** + * Quickstart that demonstrates Materialized View (MV) creation and ingestion. + * + *

This quickstart: + *

    + *
  1. Loads the {@code airlineStats} base table (31 days of flight data, Jan 2014).
  2. + *
  3. Creates an empty {@code airlineStatsMv} table configured with a + * {@code MaterializedViewTask} that pre-aggregates daily carrier metrics.
  4. + *
  5. Triggers the minion task to materialize the MV and waits for completion.
  6. + *
  7. Runs each aggregation against both the base table and the MV table, prints both + * result sets, and asserts they agree (the MV is correct iff the re-aggregation over + * the MV's per-day-per-carrier rows reproduces the base table's answer).
  8. + *
+ * + *

The MV definition is: + *

+ *   SELECT DaysSinceEpoch, Carrier,
+ *          SUM(ArrDelay) AS sum_ArrDelay,
+ *          COUNT(*) AS flight_count,
+ *          MIN(ArrDelay) AS min_ArrDelay,
+ *          MAX(ArrDelay) AS max_ArrDelay,
+ *          DISTINCTCOUNTRAWHLL(FlightNum) AS raw_hll_FlightNum,
+ *          DISTINCTCOUNTRAWHLLPLUS(FlightNum) AS raw_hllplus_FlightNum
+ *   FROM airlineStats
+ *   GROUP BY DaysSinceEpoch, Carrier
+ * 
+ * + *

Broker query rewrite (so callers don't have to know about the MV) lands in a follow-up + * PR; until then, callers query the MV table directly. The comparison step in this quickstart + * uses the same re-aggregation pattern the rewrite engine will use: + * {@code SUM} over {@code sum_ArrDelay}, {@code SUM} over {@code flight_count}, + * {@code MIN}/{@code MAX} over their stored mins/maxes, and {@code DISTINCTCOUNTHLL} / + * {@code DISTINCTCOUNTHLLPLUS} applied directly to the raw-sketch columns (the sketch is + * deserialized and merged in-place). + * + *

The example table config sets {@code maxTasksPerBatch=31} to backfill all 31 days of + * the airlineStats fixture in a single scheduling cycle. Production deployments typically + * leave the default of 4; raise it only when intentionally back-filling and after sizing + * the minion pool to absorb the resulting concurrent task load. + * + *

Run via: {@code bin/pinot-admin.sh QuickStart -type MATERIALIZED_VIEW} + */ +public class MaterializedViewQuickStart extends Quickstart { + + private static final String BASE_TABLE = "airlineStats"; + private static final String MATERIALIZED_VIEW_TABLE = "airlineStatsMv"; + private static final int FIXTURE_COVERAGE_UPPER_DAY = 16102; + private static final long TASK_POLL_INTERVAL_MS = 5_000L; + private static final long TASK_TIMEOUT_MS = 300_000L; + + @Override + public List types() { + return Arrays.asList("MATERIALIZED_VIEW", "MATERIALIZED-VIEW", "BATCH_MV"); + } + + @Override + public Map getConfigOverrides() { + Map overrides = new HashMap<>(super.getConfigOverrides()); + overrides.putIfAbsent("controller.task.scheduler.enabled", true); + return overrides; + } + + @Override + protected String[] getDefaultBatchTableDirectories() { + return new String[]{ + "examples/batch/airlineStats", + "examples/batch/airlineStatsMv" + }; + } + + @Override + protected String getValidationTypesToSkip() { + return "TASK"; + } + + @Override + public void runSampleQueries(QuickstartRunner runner) + throws Exception { + printStatus(Color.CYAN, "***** Step 1: Verify airlineStats base table is loaded *****"); + + runQuery(runner, "Count all flights in airlineStats", + "SELECT COUNT(*) FROM " + BASE_TABLE + " LIMIT 1"); + + runQuery(runner, "Top 10 carriers by total arrival delay (direct base table query)", + "SELECT Carrier, SUM(ArrDelay) AS total_delay, COUNT(*) AS flights " + + "FROM " + BASE_TABLE + " GROUP BY Carrier ORDER BY total_delay DESC LIMIT 10"); + + MinionClient minionClient = new MinionClient( + "http://localhost:" + QuickstartRunner.DEFAULT_CONTROLLER_PORT, null); + + printStatus(Color.CYAN, "***** Step 2: Trigger MaterializedViewTask to generate MV segments *****"); + printStatus(Color.GREEN, + "airlineStatsMv stores SUM, COUNT, MIN, MAX and raw HLL/HLLPlus sketches by day and carrier."); + triggerMaterializedViewTask(minionClient); + + printStatus(Color.CYAN, "***** 
Step 3: Wait for MV segments to be generated and served *****"); + waitForMaterializedViewSegments(runner, minionClient); + + printStatus(Color.CYAN, "***** Step 4: Verify base-table vs MV-table results match *****"); + printStatus(Color.GREEN, + "For each aggregation, the same logical answer is computed two ways: directly from the base, and by " + + "re-aggregating the pre-computed values stored in the MV. Mismatches indicate an MV ingestion bug."); + + String windowFilter = " WHERE DaysSinceEpoch < " + FIXTURE_COVERAGE_UPPER_DAY + " "; + + runComparison(runner, "SUM and COUNT: top 10 carriers by total arrival delay", + "SELECT Carrier, SUM(ArrDelay) AS total_delay, COUNT(*) AS flights " + + "FROM " + BASE_TABLE + windowFilter + + "GROUP BY Carrier ORDER BY Carrier LIMIT 100", + "SELECT Carrier, SUM(sum_ArrDelay) AS total_delay, SUM(flight_count) AS flights " + + "FROM " + MATERIALIZED_VIEW_TABLE + " GROUP BY Carrier ORDER BY Carrier LIMIT 100"); + + // Time-filtered carrier comparison. Both sides apply a day-window predicate on their native + // time column (DaysSinceEpoch on the base, tsMs=DaysSinceEpoch*86400000 on the MV) and + // collapse to per-Carrier totals so the row shapes match without TIMESTAMP-vs-LONG wire + // serialization differences. Exercises the time predicate on the MV side; the unfiltered + // queries above already cover full-table re-aggregation correctness. 
+ long fromTsMs = 16071L * 86400000L; + long toTsMs = 16080L * 86400000L; + runComparison(runner, "SUM (time-filtered): carrier totals over first 10 days of Jan 2014", + "SELECT Carrier, SUM(ArrDelay) AS total_delay, COUNT(*) AS flights " + + "FROM " + BASE_TABLE + + " WHERE DaysSinceEpoch BETWEEN 16071 AND 16080 " + + "GROUP BY Carrier ORDER BY Carrier LIMIT 100", + "SELECT Carrier, SUM(sum_ArrDelay) AS total_delay, SUM(flight_count) AS flights " + + "FROM " + MATERIALIZED_VIEW_TABLE + + " WHERE tsMs BETWEEN " + fromTsMs + " AND " + toTsMs + " " + + "GROUP BY Carrier ORDER BY Carrier LIMIT 100"); + + runComparison(runner, "MIN and MAX: arrival delay range by carrier", + "SELECT Carrier, MIN(ArrDelay) AS min_delay, MAX(ArrDelay) AS max_delay " + + "FROM " + BASE_TABLE + windowFilter + + "GROUP BY Carrier ORDER BY Carrier LIMIT 100", + "SELECT Carrier, MIN(min_ArrDelay) AS min_delay, MAX(max_ArrDelay) AS max_delay " + + "FROM " + MATERIALIZED_VIEW_TABLE + " GROUP BY Carrier ORDER BY Carrier LIMIT 100"); + + // HLL sketches: DISTINCTCOUNTHLL applied to the raw-sketch column merges sketches before + // returning the cardinality estimate. Identical sketch parameters in both queries are + // required for byte-identical results — the MV stored DISTINCTCOUNTRAWHLL with defaults, + // so we query DISTINCTCOUNTHLL with defaults on both sides. 
+ runComparison(runner, "DISTINCTCOUNTHLL: approximate distinct flight numbers by carrier", + "SELECT Carrier, DISTINCTCOUNTHLL(FlightNum) AS approx_flight_nums " + + "FROM " + BASE_TABLE + windowFilter + + "GROUP BY Carrier ORDER BY Carrier LIMIT 100", + "SELECT Carrier, DISTINCTCOUNTHLL(raw_hll_FlightNum) AS approx_flight_nums " + + "FROM " + MATERIALIZED_VIEW_TABLE + " GROUP BY Carrier ORDER BY Carrier LIMIT 100"); + + runComparison(runner, "DISTINCTCOUNTHLLPLUS: approximate distinct flight numbers by carrier", + "SELECT Carrier, DISTINCTCOUNTHLLPLUS(FlightNum) AS approx_flight_nums_hllplus " + + "FROM " + BASE_TABLE + windowFilter + + "GROUP BY Carrier ORDER BY Carrier LIMIT 100", + "SELECT Carrier, DISTINCTCOUNTHLLPLUS(raw_hllplus_FlightNum) AS approx_flight_nums_hllplus " + + "FROM " + MATERIALIZED_VIEW_TABLE + " GROUP BY Carrier ORDER BY Carrier LIMIT 100"); + + printStatus(Color.GREEN, String.format( + "You can always go to http://localhost:%d to play around in the query console", + QuickstartRunner.DEFAULT_CONTROLLER_PORT)); + printStatus(Color.GREEN, + "Try: SELECT Carrier, MIN(min_ArrDelay), MAX(max_ArrDelay) " + + "FROM airlineStatsMv GROUP BY Carrier"); + } + + private JsonNode runQuery(QuickstartRunner runner, String description, String query) + throws Exception { + printStatus(Color.YELLOW, description); + printStatus(Color.CYAN, "Query : " + query); + JsonNode response = runner.runQuery(query); + printStatus(Color.YELLOW, prettyPrintResponse(response)); + printStatus(Color.GREEN, "***************************************************"); + return response; + } + + /// Runs the same logical aggregation against the base table and the MV table, prints both + /// result sets, and reports whether the row data matches. Both queries must produce results + /// in the same column order (the comparison is positional within each row); the caller is + /// responsible for crafting them to be apples-to-apples. 
+ private void runComparison(QuickstartRunner runner, String description, String baseQuery, String mvQuery) + throws Exception { + printStatus(Color.YELLOW, description + " — base table"); + printStatus(Color.CYAN, "Base : " + baseQuery); + JsonNode baseResponse = runner.runQuery(baseQuery); + printStatus(Color.YELLOW, prettyPrintResponse(baseResponse)); + + printStatus(Color.YELLOW, description + " — MV table"); + printStatus(Color.CYAN, "MV : " + mvQuery); + JsonNode mvResponse = runner.runQuery(mvQuery); + printStatus(Color.YELLOW, prettyPrintResponse(mvResponse)); + + String mismatch = compareResultRows(baseResponse, mvResponse); + if (mismatch == null) { + printStatus(Color.GREEN, "*** Base and MV results MATCH ***"); + } else { + printStatus(Color.YELLOW, "WARNING: base and MV results DIFFER — " + mismatch); + } + printStatus(Color.GREEN, "***************************************************"); + } + + /// Compares the `rows` arrays of two Pinot query responses positionally with + /// type-tolerant cell equality: numeric cells (including DOUBLE/LONG/INT and + /// JSON-stringified numbers) are compared as doubles so a base-side INT `COUNT` + /// matches an MV-side DOUBLE `SUM(flight_count)`; non-numeric cells fall back to + /// string equality. + /// + /// Returns `null` when the rows agree, or a short diagnostic string when they do not. 
+ private static String compareResultRows(JsonNode baseResponse, JsonNode mvResponse) { + if (baseResponse == null || mvResponse == null) { + return "one of the responses was null"; + } + if (responseHasException(baseResponse)) { + return "base query produced exceptions"; + } + if (responseHasException(mvResponse)) { + return "MV query produced exceptions"; + } + JsonNode baseRows = baseResponse.path("resultTable").path("rows"); + JsonNode mvRows = mvResponse.path("resultTable").path("rows"); + if (!baseRows.isArray() || !mvRows.isArray()) { + return "missing resultTable.rows array on at least one side"; + } + if (baseRows.size() != mvRows.size()) { + return "row count differs: base=" + baseRows.size() + ", mv=" + mvRows.size(); + } + for (int r = 0; r < baseRows.size(); r++) { + JsonNode baseRow = baseRows.get(r); + JsonNode mvRow = mvRows.get(r); + if (baseRow.size() != mvRow.size()) { + return "row " + r + " column count differs: base=" + baseRow.size() + ", mv=" + mvRow.size(); + } + for (int c = 0; c < baseRow.size(); c++) { + if (!cellEquals(baseRow.get(c), mvRow.get(c))) { + return "row " + r + " column " + c + " differs: base=" + baseRow.get(c).asText() + + ", mv=" + mvRow.get(c).asText(); + } + } + } + return null; + } + + /// Cell-level equality with type tolerance. Treats `COUNT(*)`-as-INT and + /// `SUM(flight_count)`-as-DOUBLE as equal when their numeric values agree, and falls + /// back to string compare otherwise. 
+ private static boolean cellEquals(JsonNode base, JsonNode mv) { + String baseText = base.asText(); + String mvText = mv.asText(); + if (baseText.equals(mvText)) { + return true; + } + Double baseNum = tryParseDouble(baseText); + Double mvNum = tryParseDouble(mvText); + if (baseNum != null && mvNum != null) { + return baseNum.doubleValue() == mvNum.doubleValue(); + } + return false; + } + + private static Double tryParseDouble(String s) { + if (s == null || s.isEmpty()) { + return null; + } + try { + return Double.parseDouble(s); + } catch (NumberFormatException e) { + return null; + } + } + + private static boolean responseHasException(JsonNode response) { + JsonNode exceptions = response.path("exceptions"); + return exceptions.isArray() && exceptions.size() > 0; + } + + private void triggerMaterializedViewTask(MinionClient minionClient) { + try { + Map scheduled = minionClient.scheduleMinionTasks( + CommonConstants.MaterializedViewTask.TASK_TYPE, + MATERIALIZED_VIEW_TABLE + "_OFFLINE"); + if (scheduled.isEmpty()) { + printStatus(Color.YELLOW, + "No tasks scheduled — MV may already be up-to-date or minion is still starting up"); + } else { + printStatus(Color.GREEN, "Scheduled MV tasks: " + scheduled); + } + } catch (Exception e) { + printStatus(Color.YELLOW, "Could not schedule MV task (will retry): " + e.getMessage()); + } + } + + private void waitForMaterializedViewSegments(QuickstartRunner runner, MinionClient minionClient) + throws Exception { + long expectedRows = getExpectedMaterializedViewRowCount(runner); + if (expectedRows > 0) { + printStatus(Color.CYAN, + "Waiting up to 5 minutes for all " + expectedRows + " MV pre-aggregated rows to be generated..."); + } else { + printStatus(Color.CYAN, "Waiting up to 5 minutes for MV segments to be generated..."); + } + long deadline = System.currentTimeMillis() + TASK_TIMEOUT_MS; + while (System.currentTimeMillis() < deadline) { + try { + JsonNode result = runner.runQuery("SELECT COUNT(*) FROM " + 
MATERIALIZED_VIEW_TABLE + " LIMIT 1"); + JsonNode rows = result.path("resultTable").path("rows"); + if (rows.isArray() && rows.size() > 0) { + long count = rows.get(0).get(0).asLong(); + if (expectedRows > 0) { + if (count >= expectedRows) { + printStatus(Color.GREEN, + "MV table " + MATERIALIZED_VIEW_TABLE + " is ready with " + count + " pre-aggregated rows."); + return; + } + printStatus(Color.CYAN, + "MV table " + MATERIALIZED_VIEW_TABLE + " has " + count + " of " + expectedRows + + " pre-aggregated rows, retrying..."); + } else if (count > 0) { + printStatus(Color.GREEN, + "MV table " + MATERIALIZED_VIEW_TABLE + " is ready with " + count + " pre-aggregated rows."); + return; + } + } + } catch (Exception e) { + printStatus(Color.YELLOW, "MV not ready yet (" + e.getMessage() + "), retrying..."); + } + printStatus(Color.CYAN, + "MV not ready yet, retrying in " + (TASK_POLL_INTERVAL_MS / 1000) + "s..."); + + // Re-trigger in case the scheduler hasn't picked it up yet + triggerMaterializedViewTask(minionClient); + + Thread.sleep(TASK_POLL_INTERVAL_MS); + } + printStatus(Color.YELLOW, + "Timed out waiting for MV segments. 
Comparison step will likely show mismatches."); + } + + private long getExpectedMaterializedViewRowCount(QuickstartRunner runner) { + try { + JsonNode result = runner.runQuery("SELECT DaysSinceEpoch, Carrier, COUNT(*) " + + "FROM " + BASE_TABLE + " GROUP BY DaysSinceEpoch, Carrier LIMIT 10000"); + JsonNode rows = result.path("resultTable").path("rows"); + if (rows.isArray()) { + return rows.size(); + } + } catch (Exception e) { + printStatus(Color.YELLOW, + "Could not compute expected MV row count from base table (" + e.getMessage() + + "); falling back to first served MV segment."); + } + return -1; + } + + public static void main(String[] args) + throws Exception { + List arguments = new ArrayList<>(); + arguments.addAll(Arrays.asList("QuickStart", "-type", "MATERIALIZED_VIEW")); + arguments.addAll(Arrays.asList(args)); + PinotAdministrator.main(arguments.toArray(new String[arguments.size()])); + } +} diff --git a/pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_offline_table_config.json b/pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_offline_table_config.json new file mode 100644 index 000000000000..26682fe8903c --- /dev/null +++ b/pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_offline_table_config.json @@ -0,0 +1,26 @@ +{ + "tableName": "airlineStatsMv", + "tableType": "OFFLINE", + "segmentsConfig": { + "timeColumnName": "tsMs", + "timeType": "MILLISECONDS", + "segmentPushType": "APPEND", + "replication": "1" + }, + "tenants": {}, + "tableIndexConfig": { + "loadMode": "MMAP" + }, + "metadata": { + "customConfigs": {} + }, + "task": { + "taskTypeConfigsMap": { + "MaterializedViewTask": { + "definedSQL": "SELECT DaysSinceEpoch * 86400000 AS tsMs, Carrier, SUM(ArrDelay) AS sum_ArrDelay, COUNT(*) AS flight_count, MIN(ArrDelay) AS min_ArrDelay, MAX(ArrDelay) AS max_ArrDelay, DISTINCTCOUNTRAWHLL(FlightNum) AS raw_hll_FlightNum, DISTINCTCOUNTRAWHLLPLUS(FlightNum) AS 
raw_hllplus_FlightNum FROM airlineStats GROUP BY DaysSinceEpoch * 86400000, Carrier", + "bucketTimePeriod": "1d", + "maxTasksPerBatch": "31" + } + } + } +} diff --git a/pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_schema.json b/pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_schema.json new file mode 100644 index 000000000000..3ca08507030b --- /dev/null +++ b/pinot-tools/src/main/resources/examples/batch/airlineStatsMv/airlineStatsMv_schema.json @@ -0,0 +1,47 @@ +{ + "schemaName": "airlineStatsMv", + "dimensionFieldSpecs": [ + { + "dataType": "STRING", + "name": "Carrier" + } + ], + "metricFieldSpecs": [ + { + "dataType": "LONG", + "defaultNullValue": 0, + "name": "sum_ArrDelay" + }, + { + "dataType": "LONG", + "defaultNullValue": 0, + "name": "flight_count" + }, + { + "dataType": "INT", + "name": "min_ArrDelay" + }, + { + "dataType": "INT", + "name": "max_ArrDelay" + }, + { + "dataType": "BYTES", + "maxLength": 16384, + "name": "raw_hll_FlightNum" + }, + { + "dataType": "BYTES", + "maxLength": 16384, + "name": "raw_hllplus_FlightNum" + } + ], + "dateTimeFieldSpecs": [ + { + "name": "tsMs", + "dataType": "TIMESTAMP", + "format": "1:MILLISECONDS:TIMESTAMP", + "granularity": "1:MILLISECONDS" + } + ] +} diff --git a/pom.xml b/pom.xml index c773acffc2cc..359224a5a2e0 100644 --- a/pom.xml +++ b/pom.xml @@ -46,6 +46,7 @@ pinot-clients pinot-server pinot-core + pinot-materialized-view pinot-controller pinot-minion pinot-plugins @@ -637,6 +638,11 @@ pinot-core ${project.version} + + org.apache.pinot + pinot-materialized-view + ${project.version} + org.apache.pinot pinot-query-planner