From 60877d15cb179df9615b4ac14620a89bce725d08 Mon Sep 17 00:00:00 2001
From: Jia Guo
Date: Thu, 14 Sep 2023 14:55:37 -0700
Subject: [PATCH 01/19] add tenant partition map rest resource

add tenant partition map rest resource

fix logic

refactor and fix some logic

pre-configuration based assignment
---
 .../InstanceAssignmentConfigUtils.java        |  15 +
 ...inotInstanceAssignmentRestletResource.java |  28 +-
 .../resources/PinotTenantRestletResource.java |  92 ++-
 .../helix/core/PinotHelixResourceManager.java |  11 +
 .../instance/InstanceAssignmentDriver.java    |  21 +-
 .../InstancePartitionSelectorFactory.java     |  12 +-
 ...reConfiguredInstancePartitionSelector.java | 304 ++++++++
 .../helix/core/rebalance/TableRebalancer.java |  52 +-
 .../instance/InstanceAssignmentTest.java      | 726 +++++++++++++++++-
 .../org/apache/pinot/core/auth/Actions.java   |   2 +
 .../assignment/InstanceAssignmentConfig.java  |   3 +-
 11 files changed, 1233 insertions(+), 33 deletions(-)
 create mode 100644 pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/PreConfiguredInstancePartitionSelector.java

diff --git a/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java b/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java
index b37429c527f..cd056f2ca2e 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java
@@ -124,4 +124,19 @@ public static InstanceAssignmentConfig getInstanceAssignmentConfig(TableConfig t
 
     return new InstanceAssignmentConfig(tagPoolConfig, null, replicaGroupPartitionConfig);
   }
+
+  /**
+   * Returns whether instances of the given type are assigned by the pre-configuration based partition selector.
+   *
+   * <p>Returns {@code false} when the table has no instance assignment config map, or no entry for the type.
+   */
+  public static boolean isPreConfigurationBasedAssignment(TableConfig tableConfig,
+      InstancePartitionsType instancePartitionsType) {
+    // An explicit instance assignment config must exist for the type, and it must select
+    // PRE_CONFIGURATION_BASED_PARTITION_SELECTOR
+    return tableConfig.getInstanceAssignmentConfigMap() != null
+        && tableConfig.getInstanceAssignmentConfigMap().get(instancePartitionsType.toString()) != null
+        && getInstanceAssignmentConfig(tableConfig, instancePartitionsType).getPartitionSelector()
+        == InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR;
+  }
 }
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java
index 3eeb6665a49..38d3af7cbde 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java
@@ -244,20 +244,32 @@ public Map<String, InstancePartitions> assignInstances(
   private void assignInstancesForInstancePartitionsType(Map<String, InstancePartitions> instancePartitionsMap,
       TableConfig tableConfig, List<InstanceConfig> instanceConfigs, InstancePartitionsType instancePartitionsType) {
     String tableNameWithType = tableConfig.getTableName();
-    if (TableConfigUtils.hasPreConfiguredInstancePartitions(tableConfig, instancePartitionsType)) {
+    if (!TableConfigUtils.hasPreConfiguredInstancePartitions(tableConfig, instancePartitionsType)) {
+      InstancePartitions existingInstancePartitions =
+          InstancePartitionsUtils.fetchInstancePartitions(_resourceManager.getHelixZkManager().getHelixPropertyStore(),
+              InstancePartitionsUtils.getInstancePartitionsName(tableNameWithType, instancePartitionsType.toString()));
+      instancePartitionsMap.put(instancePartitionsType.toString(),
+          new InstanceAssignmentDriver(tableConfig).assignInstances(instancePartitionsType, instanceConfigs,
+              existingInstancePartitions));
+    } else if (InstanceAssignmentConfigUtils.isPreConfigurationBasedAssignment(tableConfig, instancePartitionsType)) {
+      InstancePartitions existingInstancePartitions =
+          InstancePartitionsUtils.fetchInstancePartitions(_resourceManager.getHelixZkManager().getHelixPropertyStore(),
+              InstancePartitionsUtils.getInstancePartitionsName(tableNameWithType, instancePartitionsType.toString()));
+      String rawTableName = TableNameBuilder.extractRawTableName(tableNameWithType);
+      InstancePartitions preConfigured =
+          InstancePartitionsUtils.fetchInstancePartitionsWithRename(_resourceManager.getPropertyStore(),
+              tableConfig.getInstancePartitionsMap().get(instancePartitionsType),
+              instancePartitionsType.getInstancePartitionsName(rawTableName));
+      instancePartitionsMap.put(instancePartitionsType.toString(),
+          new InstanceAssignmentDriver(tableConfig).assignInstances(instancePartitionsType, instanceConfigs,
+              existingInstancePartitions, preConfigured));
+    } else {
       String rawTableName = TableNameBuilder.extractRawTableName(tableNameWithType);
       instancePartitionsMap.put(instancePartitionsType.toString(),
           InstancePartitionsUtils.fetchInstancePartitionsWithRename(_resourceManager.getPropertyStore(),
               tableConfig.getInstancePartitionsMap().get(instancePartitionsType),
               instancePartitionsType.getInstancePartitionsName(rawTableName)));
-      return;
     }
-    InstancePartitions existingInstancePartitions =
-        InstancePartitionsUtils.fetchInstancePartitions(_resourceManager.getHelixZkManager().getHelixPropertyStore(),
-            InstancePartitionsUtils.getInstancePartitionsName(tableNameWithType, instancePartitionsType.toString()));
-    instancePartitionsMap.put(instancePartitionsType.toString(),
-        new InstanceAssignmentDriver(tableConfig).assignInstances(instancePartitionsType, instanceConfigs,
-            existingInstancePartitions));
   }
 
   private void assignInstancesForTier(Map<String, InstancePartitions> instancePartitionsMap, TableConfig tableConfig,
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java
index 3aa5da48e91..16faa3ea0d1 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java
@@ -30,6 +30,7 @@
 import io.swagger.annotations.Authorization;
 import io.swagger.annotations.SecurityDefinition;
 import io.swagger.annotations.SwaggerDefinition;
+import java.io.IOException;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -50,6 +51,8 @@
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 import org.apache.helix.model.InstanceConfig;
+import org.apache.pinot.common.assignment.InstancePartitions;
+import org.apache.pinot.common.assignment.InstancePartitionsUtils;
 import org.apache.pinot.common.metadata.controllerjob.ControllerJobType;
 import org.apache.pinot.common.metrics.ControllerMeter;
 import org.apache.pinot.common.metrics.ControllerMetrics;
@@ -68,6 +71,7 @@
 import org.apache.pinot.core.auth.TargetType;
 import org.apache.pinot.spi.config.table.TableConfig;
 import org.apache.pinot.spi.config.table.TableType;
+import org.apache.pinot.spi.config.table.assignment.InstancePartitionsType;
 import org.apache.pinot.spi.config.tenant.Tenant;
 import org.apache.pinot.spi.config.tenant.TenantRole;
 import org.apache.pinot.spi.utils.JsonUtils;
@@ -286,6 +290,94 @@ public String getTablesOnTenant(
     }
   }
 
+  @GET
+  @Path("/tenants/{tenantName}/instancePartitions")
+  @Authorize(targetType = TargetType.CLUSTER, action = Actions.Cluster.GET_INSTANCE_PARTITION)
+  @Authenticate(AccessType.READ)
+  @Produces(MediaType.APPLICATION_JSON)
+  @ApiOperation(value = "Get the instance partitions of a tenant")
+  @ApiResponses(value = {@ApiResponse(code = 200, message = "Success"),
+      @ApiResponse(code = 400, message = "Invalid server type"),
+      @ApiResponse(code = 404, message = "Instance partitions not found")})
+  public InstancePartitions getInstancePartitions(
+      @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName,
+      @ApiParam(value = "Instance partitions type (OFFLINE|CONSUMING|COMPLETED)", required = true)
+      @QueryParam("serverType") String serverType) {
+    String tenantNameWithType = getTenantInstancePartitionsName(tenantName, serverType);
+    InstancePartitions instancePartitions =
+        InstancePartitionsUtils.fetchInstancePartitions(_pinotHelixResourceManager.getPropertyStore(),
+            tenantNameWithType);
+    if (instancePartitions == null) {
+      throw new ControllerApplicationException(LOGGER, "Failed to find the instance partitions",
+          Response.Status.NOT_FOUND);
+    }
+    return instancePartitions;
+  }
+
+  @PUT
+  @Path("/tenants/{tenantName}/instancePartitions")
+  @Authorize(targetType = TargetType.CLUSTER, action = Actions.Cluster.UPDATE_INSTANCE_PARTITION)
+  @Authenticate(AccessType.UPDATE)
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.APPLICATION_JSON)
+  @ApiOperation(value = "Update an instance partition for a server type in a tenant")
+  @ApiResponses(value = {@ApiResponse(code = 200, message = "Success"),
+      @ApiResponse(code = 400, message = "Invalid server type or instance partitions"),
+      @ApiResponse(code = 500, message = "Failed to update the tenant")})
+  public InstancePartitions assignInstancesPartitionMap(
+      @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName,
+      @ApiParam(value = "Instance partitions type (OFFLINE|CONSUMING|COMPLETED)", required = true)
+      @QueryParam("serverType") String serverType,
+      String instancePartitionsStr) {
+    InstancePartitions instancePartitions;
+    try {
+      instancePartitions = JsonUtils.stringToObject(instancePartitionsStr, InstancePartitions.class);
+    } catch (IOException e) {
+      throw new ControllerApplicationException(LOGGER, "Failed to deserialize the instance partitions",
+          Response.Status.BAD_REQUEST, e);
+    }
+
+    String tenantNameWithType = getTenantInstancePartitionsName(tenantName, serverType);
+    // A mismatching name is a client error, so report it as 400 rather than failing a state check with a 500
+    if (!tenantNameWithType.equals(instancePartitions.getInstancePartitionsName())) {
+      throw new ControllerApplicationException(LOGGER,
+          String.format("Instance partitions name mismatch, expected: %s, got: %s", tenantNameWithType,
+              instancePartitions.getInstancePartitionsName()), Response.Status.BAD_REQUEST);
+    }
+
+    persistInstancePartitionsHelper(instancePartitions);
+    return instancePartitions;
+  }
+
+  /**
+   * Returns the name of the tenant level instance partitions for the given type. A missing or unknown type is
+   * reported as 400 instead of letting the exception thrown by valueOf surface as a 500.
+   */
+  private String getTenantInstancePartitionsName(String tenantName, String serverType) {
+    if (serverType == null) {
+      throw new ControllerApplicationException(LOGGER, "Query parameter 'serverType' must be specified",
+          Response.Status.BAD_REQUEST);
+    }
+    try {
+      return InstancePartitionsType.valueOf(serverType).getInstancePartitionsName(tenantName);
+    } catch (IllegalArgumentException e) {
+      throw new ControllerApplicationException(LOGGER, "Invalid server type: " + serverType,
+          Response.Status.BAD_REQUEST, e);
+    }
+  }
+
+  /** Persists the given instance partitions, reporting any failure as a 500. */
+  private void persistInstancePartitionsHelper(InstancePartitions instancePartitions) {
+    try {
+      LOGGER.info("Persisting instance partitions: {}", instancePartitions);
+      InstancePartitionsUtils.persistInstancePartitions(_pinotHelixResourceManager.getPropertyStore(),
+          instancePartitions);
+    } catch (Exception e) {
+      throw new ControllerApplicationException(LOGGER, "Caught Exception while persisting the instance partitions",
+          Response.Status.INTERNAL_SERVER_ERROR, e);
+    }
+  }
+
   private String getTablesServedFromServerTenant(String tenantName) {
     Set<String> tables = new HashSet<>();
     ObjectNode resourceGetRet = JsonUtils.newObjectNode();
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java
index 860a9e1f830..f47a3e93ee0 100644
--- 
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java
@@ -1735,11 +1735,22 @@ private void assignInstances(TableConfig tableConfig, boolean override) {
       for (InstancePartitionsType instancePartitionsType : instancePartitionsTypesToAssign) {
         boolean hasPreConfiguredInstancePartitions =
             TableConfigUtils.hasPreConfiguredInstancePartitions(tableConfig, instancePartitionsType);
+        boolean isPreConfigurationBasedAssignment =
+            InstanceAssignmentConfigUtils.isPreConfigurationBasedAssignment(tableConfig, instancePartitionsType);
         InstancePartitions instancePartitions;
         if (!hasPreConfiguredInstancePartitions) {
           instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, instanceConfigs, null);
           LOGGER.info("Persisting instance partitions: {}", instancePartitions);
           InstancePartitionsUtils.persistInstancePartitions(_propertyStore, instancePartitions);
+        } else if (isPreConfigurationBasedAssignment) { // assign on top of the referenced instance partitions
+          String referenceInstancePartitionsName = tableConfig.getInstancePartitionsMap().get(instancePartitionsType);
+          InstancePartitions preConfiguredInstancePartitions = // presumably renamed to this table's name
+              InstancePartitionsUtils.fetchInstancePartitionsWithRename(_propertyStore, referenceInstancePartitionsName,
+                  instancePartitionsType.getInstancePartitionsName(rawTableName));
+          instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, instanceConfigs, null,
+              preConfiguredInstancePartitions); // existing instance partitions are passed as null
+          LOGGER.info("Persisting instance partitions: {}", instancePartitions);
+          InstancePartitionsUtils.persistInstancePartitions(_propertyStore, instancePartitions);
         } else {
           String referenceInstancePartitionsName = tableConfig.getInstancePartitionsMap().get(instancePartitionsType);
           instancePartitions =
diff --git 
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentDriver.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentDriver.java
index 7a5c9010293..6d869b86c16 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentDriver.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentDriver.java
@@ -60,19 +60,38 @@ public InstancePartitions assignInstances(InstancePartitionsType instancePartiti
         InstanceAssignmentConfigUtils.getInstanceAssignmentConfig(_tableConfig, instancePartitionsType);
     return getInstancePartitions(
         instancePartitionsType.getInstancePartitionsName(TableNameBuilder.extractRawTableName(tableNameWithType)),
-        assignmentConfig, instanceConfigs, existingInstancePartitions);
+        assignmentConfig, instanceConfigs, existingInstancePartitions, null);
+  }
+
+  /**
+   * Assigns instances for the given instance partitions type, additionally passing the pre-configured instance
+   * partitions on to the partition selector.
+   *
+   * @param existingInstancePartitions existing instance partitions of the table, may be null
+   * @param preConfiguredInstancePartitions pre-configured instance partitions to assign on top of, may be null
+   */
+  public InstancePartitions assignInstances(InstancePartitionsType instancePartitionsType,
+      List<InstanceConfig> instanceConfigs, @Nullable InstancePartitions existingInstancePartitions,
+      @Nullable InstancePartitions preConfiguredInstancePartitions) {
+    String tableNameWithType = _tableConfig.getTableName();
+    InstanceAssignmentConfig assignmentConfig =
+        InstanceAssignmentConfigUtils.getInstanceAssignmentConfig(_tableConfig, instancePartitionsType);
+    return getInstancePartitions(
+        instancePartitionsType.getInstancePartitionsName(TableNameBuilder.extractRawTableName(tableNameWithType)),
+        assignmentConfig, instanceConfigs, existingInstancePartitions, preConfiguredInstancePartitions);
   }
 
   public InstancePartitions assignInstances(String tierName, List<InstanceConfig> instanceConfigs,
       @Nullable InstancePartitions existingInstancePartitions, InstanceAssignmentConfig instanceAssignmentConfig) {
     return getInstancePartitions(
         InstancePartitionsUtils.getInstancePartitionsNameForTier(_tableConfig.getTableName(), tierName),
-        instanceAssignmentConfig, instanceConfigs, existingInstancePartitions);
+        instanceAssignmentConfig, instanceConfigs, existingInstancePartitions, null);
   }
 
   private InstancePartitions getInstancePartitions(String instancePartitionsName,
       InstanceAssignmentConfig instanceAssignmentConfig, List<InstanceConfig> instanceConfigs,
-      @Nullable InstancePartitions existingInstancePartitions) {
+      @Nullable InstancePartitions existingInstancePartitions,
+      @Nullable InstancePartitions preConfiguredInstancePartitions) {
     String tableNameWithType = _tableConfig.getTableName();
     LOGGER.info("Starting {} instance assignment for table {}", instancePartitionsName, tableNameWithType);
 
@@ -93,7 +112,8 @@ private InstancePartitions getInstancePartitions(String instancePartitionsName,
 
     InstancePartitionSelector instancePartitionSelector =
         InstancePartitionSelectorFactory.getInstance(instanceAssignmentConfig.getPartitionSelector(),
-            instanceAssignmentConfig.getReplicaGroupPartitionConfig(), tableNameWithType, existingInstancePartitions);
+            instanceAssignmentConfig.getReplicaGroupPartitionConfig(), tableNameWithType, existingInstancePartitions,
+            preConfiguredInstancePartitions);
     InstancePartitions instancePartitions = new InstancePartitions(instancePartitionsName);
     instancePartitionSelector.selectInstances(poolToInstanceConfigsMap, instancePartitions);
     return instancePartitions;
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstancePartitionSelectorFactory.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstancePartitionSelectorFactory.java
index f786ffe0b41..a49727c15a5 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstancePartitionSelectorFactory.java
+++ 
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/InstancePartitionSelectorFactory.java
@@ -31,7 +31,14 @@ private InstancePartitionSelectorFactory() {
 
   public static InstancePartitionSelector getInstance(InstanceAssignmentConfig.PartitionSelector partitionSelector,
       InstanceReplicaGroupPartitionConfig instanceReplicaGroupPartitionConfig, String tableNameWithType,
-      InstancePartitions existingInstancePartitions
+      InstancePartitions existingInstancePartitions) {
+    return getInstance(partitionSelector, instanceReplicaGroupPartitionConfig, tableNameWithType,
+        existingInstancePartitions, null); // no pre-configured instance partitions for this overload
+  }
+
+  public static InstancePartitionSelector getInstance(InstanceAssignmentConfig.PartitionSelector partitionSelector,
+      InstanceReplicaGroupPartitionConfig instanceReplicaGroupPartitionConfig, String tableNameWithType,
+      InstancePartitions existingInstancePartitions, InstancePartitions preConfiguredInstancePartitions
   ) {
     switch (partitionSelector) {
       case FD_AWARE_INSTANCE_PARTITION_SELECTOR:
@@ -40,6 +47,9 @@ public static InstancePartitionSelector getInstance(InstanceAssignmentConfig.Par
       case INSTANCE_REPLICA_GROUP_PARTITION_SELECTOR:
         return new InstanceReplicaGroupPartitionSelector(instanceReplicaGroupPartitionConfig, tableNameWithType,
             existingInstancePartitions);
+      case PRE_CONFIGURATION_BASED_PARTITION_SELECTOR: // this selector is handed preConfiguredInstancePartitions
+        return new PreConfiguredInstancePartitionSelector(instanceReplicaGroupPartitionConfig, tableNameWithType,
+            existingInstancePartitions, preConfiguredInstancePartitions);
       default:
         throw new IllegalStateException("Unexpected PartitionSelector: " + partitionSelector + ", should be from"
             + Arrays.toString(InstanceAssignmentConfig.PartitionSelector.values()));
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/PreConfiguredInstancePartitionSelector.java 
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/PreConfiguredInstancePartitionSelector.java
new file mode 100644
index 00000000000..1b3f6f13896
--- /dev/null
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/PreConfiguredInstancePartitionSelector.java
@@ -0,0 +1,318 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.controller.helix.core.assignment.instance;
+
+import com.google.common.base.Preconditions;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Random;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import org.apache.helix.model.InstanceConfig;
+import org.apache.pinot.common.assignment.InstancePartitions;
+import org.apache.pinot.spi.config.table.assignment.InstanceReplicaGroupPartitionConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Instance partition selector that builds the instance partitions of a table on top of pre-configured instance
+ * partitions.
+ *
+ * <p>The pre-configured instance partitions are read as a list of mirrored server sets, i.e. the instances at the same
+ * offset of every replica-group. The result is always made of whole mirrored server sets. When existing instance
+ * partitions are provided, the sets that overlap the most with the existing assignment are preferred.
+ */
+public class PreConfiguredInstancePartitionSelector extends InstancePartitionSelector {
+  private static final Logger LOGGER = LoggerFactory.getLogger(PreConfiguredInstancePartitionSelector.class);
+  private final InstancePartitions _preConfiguredInstancePartitions;
+
+  // dimensions of target instance partition
+  private final int _numTargetInstancesPerReplicaGroup;
+  private final int _numTargetReplicaGroups;
+  private final int _numTargetTotalInstances;
+
+  // dimensions of pre-configured instance partition
+  private int _numPreConfiguredReplicaGroups;
+  private int _numPreConfiguredInstancesPerReplicaGroup;
+
+  // dimensions of existing instance partition
+  private int _numExistingReplicaGroups;
+  private int _numExistingInstancesPerReplicaGroup;
+
+  // look up tables for pre-configured instance partition
+  private final List<List<String>> _preConfiguredMirroredServerLists = new ArrayList<>();
+  private final Map<String, Integer> _preConfiguredInstanceNameToOffsetMap = new HashMap<>();
+
+  private final List<List<String>> _existingMirroredServerLists = new ArrayList<>();
+
+  public PreConfiguredInstancePartitionSelector(InstanceReplicaGroupPartitionConfig replicaGroupPartitionConfig,
+      String tableNameWithType, InstancePartitions existingInstancePartitions,
+      InstancePartitions preConfiguredInstancePartitions) {
+    super(replicaGroupPartitionConfig, tableNameWithType, existingInstancePartitions);
+    _preConfiguredInstancePartitions = preConfiguredInstancePartitions;
+    _numTargetInstancesPerReplicaGroup = _replicaGroupPartitionConfig.getNumInstancesPerReplicaGroup();
+    _numTargetReplicaGroups = _replicaGroupPartitionConfig.getNumReplicaGroups();
+    _numTargetTotalInstances = _numTargetInstancesPerReplicaGroup * _numTargetReplicaGroups;
+  }
+
+  /**
+   * Validates the inputs of the pre-configuration based replica-group selection, and records the dimensions of the
+   * existing and the pre-configured instance partitions.
+   */
+  private void validatePoolDiversePreconditions(Map<Integer, List<InstanceConfig>> poolToInstanceConfigsMap) {
+    LOGGER.info("Validating pre-configured instance partitions for pre-configuration based replica-group selection");
+
+    // numTargetInstancesPerReplica should be positive
+    Preconditions.checkState(_numTargetInstancesPerReplicaGroup > 0,
+        "Number of instances per replica must be positive");
+    // _numTargetReplicaGroups should be positive
+    Preconditions.checkState(_numTargetReplicaGroups > 0, "Number of replica-groups must be positive");
+    // validate target partition count is 1
+    Preconditions.checkState(_replicaGroupPartitionConfig.getNumPartitions() <= 1,
+        "This algorithm does not support table level partitioning for target assignment");
+
+    // Validate the existing instance partitions is null or has only one partition
+    Preconditions.checkState(
+        (_existingInstancePartitions == null || _existingInstancePartitions.getNumPartitions() == 1),
+        "This algorithm does not support table level partitioning for existing assignment");
+
+    _numExistingReplicaGroups =
+        _existingInstancePartitions == null ? 0 : _existingInstancePartitions.getNumReplicaGroups();
+    _numExistingInstancesPerReplicaGroup =
+        _existingInstancePartitions == null ? 0 : _existingInstancePartitions.getInstances(0, 0).size();
+
+    // Validate the pre-configured instance partitions is not null and has only one partition
+    Preconditions.checkState(_preConfiguredInstancePartitions != null,
+        "Pre-configured instance partitions must be provided for pre-configuration based selection");
+    Preconditions.checkState(_preConfiguredInstancePartitions.getNumPartitions() == 1,
+        "This algorithm does not support table level partitioning for pre-configured assignment");
+
+    // Validate the number of replica-groups in the pre-configured instance partitions is equal to the target
+    // number of replica-groups
+    _numPreConfiguredReplicaGroups = _preConfiguredInstancePartitions.getNumReplicaGroups();
+    Preconditions.checkState(_numPreConfiguredReplicaGroups == _numTargetReplicaGroups,
+        "The number of replica-groups %s in the pre-configured instance partitions "
+            + "is not equal to the target number of replica-groups %s", _numPreConfiguredReplicaGroups,
+        _numTargetReplicaGroups);
+    // Validate the number of instances per replica-group in the pre-configured instance partitions is greater than or
+    // equal to the target number of instances per replica-group
+    _numPreConfiguredInstancesPerReplicaGroup = _preConfiguredInstancePartitions.getInstances(0, 0).size();
+    Preconditions.checkState(_numPreConfiguredInstancesPerReplicaGroup >= _numTargetInstancesPerReplicaGroup,
+        "The number of instances per replica-group in the pre-configured "
+            + "instance partitions is less than the target number of instances per replica-group");
+
+    // Validate the pool to instance configs map is not null or empty
+    Preconditions.checkNotNull(poolToInstanceConfigsMap, "poolToInstanceConfigsMap is null");
+    int numPools = poolToInstanceConfigsMap.size();
+    Preconditions.checkState(numPools > 0, "No pool qualified for selection");
+    Preconditions.checkState(
+        poolToInstanceConfigsMap.values().stream().mapToInt(List::size).sum() >= _numTargetTotalInstances,
+        "The total number of instances in all pools is less than the target number of target instances");
+
+    Set<String> availableInstanceSet = new HashSet<>();
+    poolToInstanceConfigsMap.values().forEach(list -> list.forEach(i -> availableInstanceSet.add(i.getInstanceName())));
+
+    for (int i = 0; i < _numPreConfiguredReplicaGroups; i++) {
+      List<String> instances = _preConfiguredInstancePartitions.getInstances(0, i);
+      // Mirrored server sets are read offset by offset, so every replica-group must have the same size
+      Preconditions.checkState(instances.size() == _numPreConfiguredInstancesPerReplicaGroup,
+          "Replica-group %s in the pre-configured instance partitions does not have %s instances", i,
+          _numPreConfiguredInstancesPerReplicaGroup);
+      for (String instance : instances) {
+        Preconditions.checkState(availableInstanceSet.contains(instance),
+            "Instance %s in pre-configured instance " + "partitions is not in the pool to instance configs map",
+            instance);
+      }
+    }
+
+    LOGGER.info("Validation passed. The instances provided can satisfy the pool diverse requirement.");
+    LOGGER.info("Trying to assign total {} instances to {} replica groups, " + "with {} instance per replica group",
+        _numTargetTotalInstances, _numTargetReplicaGroups, _numTargetInstancesPerReplicaGroup);
+  }
+
+  /**
+   * Builds the mirrored server sets of the pre-configured instance partitions: entry j holds the instance at offset j
+   * of each replica-group, in replica-group order.
+   */
+  void createListFromPreConfiguredInstanceAssignmentMap() {
+    // Rebuild from scratch so that running the selection again does not append duplicates
+    _preConfiguredMirroredServerLists.clear();
+    for (int j = 0; j < _numPreConfiguredInstancesPerReplicaGroup; j++) {
+      List<String> mirroredServerList = new ArrayList<>(_numPreConfiguredReplicaGroups);
+      for (int i = 0; i < _numPreConfiguredReplicaGroups; i++) {
+        mirroredServerList.add(_preConfiguredInstancePartitions.getInstances(0, i).get(j));
+      }
+      _preConfiguredMirroredServerLists.add(mirroredServerList);
+    }
+  }
+
+  /**
+   * Builds the lookup table from instance name to its offset within the pre-configured replica-group.
+   */
+  void createLookupTablesFromPreConfiguredInstanceAssignmentMap() {
+    _preConfiguredInstanceNameToOffsetMap.clear();
+    for (int i = 0; i < _numPreConfiguredReplicaGroups; i++) {
+      List<String> replicaGroup = _preConfiguredInstancePartitions.getInstances(0, i);
+      for (int j = 0; j < _numPreConfiguredInstancesPerReplicaGroup; j++) {
+        _preConfiguredInstanceNameToOffsetMap.put(replicaGroup.get(j), j);
+      }
+    }
+  }
+
+  /**
+   * Selects whole mirrored server sets from the pre-configured instance partitions and writes them to partition 0 of
+   * the given instance partitions.
+   *
+   * @throws IllegalStateException if the selection is not replica-group based, or a validation check fails
+   */
+  @Override
+  void selectInstances(Map<Integer, List<InstanceConfig>> poolToInstanceConfigsMap,
+      InstancePartitions instancePartitions) {
+    if (!_replicaGroupPartitionConfig.isReplicaGroupBased()) {
+      throw new IllegalStateException("Does not support Non-replica-group based selection");
+    }
+    validatePoolDiversePreconditions(poolToInstanceConfigsMap);
+    createListFromPreConfiguredInstanceAssignmentMap();
+
+    List<Integer> selectedOffsets;
+    if (_existingInstancePartitions == null) {
+      // Without existing instance partitions, just pick _numTargetInstancesPerReplicaGroup mirrored server sets
+      // in an order derived from the table name
+      LOGGER.info("No existing instance partitions found. Will build new on top of"
+          + " the pre-configured instance partitions");
+      selectedOffsets = shufflePreConfiguredOffsets().subList(0, _numTargetInstancesPerReplicaGroup);
+    } else {
+      // With existing instance partitions, adjust them based on the pre-configured instance partitions. This code
+      // path takes care of instance replacement, uplift, and downlift.
+      LOGGER.info("Existing instance partitions found. Will adjust the existing instance partitions"
+          + " based on the pre-configured instance partitions");
+      selectedOffsets = selectOffsetsSimilarToExisting();
+    }
+
+    for (int i = 0; i < _numTargetReplicaGroups; i++) {
+      List<String> replicaGroup = new ArrayList<>(_numTargetInstancesPerReplicaGroup);
+      for (int offset : selectedOffsets) {
+        replicaGroup.add(_preConfiguredMirroredServerLists.get(offset).get(i));
+      }
+      instancePartitions.setInstances(0, i, replicaGroup);
+    }
+  }
+
+  /**
+   * Returns the offsets of all pre-configured mirrored server sets, shuffled with the table name hash as the seed so
+   * that the order is stable for a table.
+   */
+  private List<Integer> shufflePreConfiguredOffsets() {
+    List<Integer> shuffledOffsets = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup);
+    for (int j = 0; j < _numPreConfiguredInstancesPerReplicaGroup; j++) {
+      shuffledOffsets.add(j);
+    }
+    Collections.shuffle(shuffledOffsets, new Random(Math.abs(_tableNameWithType.hashCode())));
+    return shuffledOffsets;
+  }
+
+  /**
+   * Returns, for each target offset, the offset of the pre-configured mirrored server set to use, preferring the sets
+   * that share the most instances with the existing mirrored server sets.
+   */
+  private List<Integer> selectOffsetsSimilarToExisting() {
+    createLookupTablesFromPreConfiguredInstanceAssignmentMap();
+    createListAndLookupTablesFromExistingInstancePartitions();
+    Set<Integer> usedPreconfiguredInstanceOffsets = new HashSet<>();
+    Map<Integer, Map.Entry<Integer, Long>> existingOffsetToResultTuple = new HashMap<>();
+
+    // For each existing offset, find the unused pre-configured mirrored server set that shares the most instances
+    // with the existing mirrored server set, and reserve it
+    for (int j = 0; j < _numExistingInstancesPerReplicaGroup; j++) {
+      int existingOffset = j;
+      _existingMirroredServerLists.get(j).stream()
+          .map(_preConfiguredInstanceNameToOffsetMap::get)
+          .filter(Objects::nonNull)
+          .filter(offset -> !usedPreconfiguredInstanceOffsets.contains(offset))
+          .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
+          .entrySet().stream().max(Map.Entry.comparingByValue()).ifPresent(e -> {
+            existingOffsetToResultTuple.put(existingOffset, e);
+            usedPreconfiguredInstanceOffsets.add(e.getKey());
+          });
+    }
+
+    if (_numExistingInstancesPerReplicaGroup > _numTargetInstancesPerReplicaGroup) {
+      // Downlift: keep the matches sharing the most instances, re-keyed from offset 0
+      List<Map.Entry<Integer, Long>> bestMatches = existingOffsetToResultTuple.values()
+          .stream()
+          .sorted((a, b) -> b.getValue().compareTo(a.getValue()))
+          .limit(_numTargetInstancesPerReplicaGroup)
+          .collect(Collectors.toList());
+      existingOffsetToResultTuple.clear();
+      usedPreconfiguredInstanceOffsets.clear();
+      for (int j = 0; j < bestMatches.size(); j++) {
+        existingOffsetToResultTuple.put(j, bestMatches.get(j));
+        usedPreconfiguredInstanceOffsets.add(bestMatches.get(j).getKey());
+      }
+    }
+
+    if (existingOffsetToResultTuple.size() < _numTargetInstancesPerReplicaGroup) {
+      // Not enough matches: fill the remaining target offsets with unused pre-configured mirrored server sets
+      List<Integer> shuffledOffsets = shufflePreConfiguredOffsets();
+      for (int k = 0, j = 0; j < _numTargetInstancesPerReplicaGroup; j++) {
+        if (existingOffsetToResultTuple.containsKey(j)) {
+          continue;
+        }
+        while (usedPreconfiguredInstanceOffsets.contains(shuffledOffsets.get(k))) {
+          k++;
+        }
+        Integer offset = shuffledOffsets.get(k);
+        existingOffsetToResultTuple.put(j, new AbstractMap.SimpleEntry<>(offset, 0L));
+        usedPreconfiguredInstanceOffsets.add(offset);
+      }
+    }
+
+    List<Integer> selectedOffsets = new ArrayList<>(_numTargetInstancesPerReplicaGroup);
+    for (int j = 0; j < _numTargetInstancesPerReplicaGroup; j++) {
+      selectedOffsets.add(existingOffsetToResultTuple.get(j).getKey());
+    }
+    return selectedOffsets;
+  }
+
+  /**
+   * Builds the mirrored server sets of the existing instance partitions. A replica-group shorter than the first one
+   * simply contributes nothing at the offsets it does not have.
+   */
+  private void createListAndLookupTablesFromExistingInstancePartitions() {
+    _existingMirroredServerLists.clear();
+    for (int j = 0; j < _numExistingInstancesPerReplicaGroup; j++) {
+      List<String> existingMirroredServerList = new ArrayList<>(_numExistingReplicaGroups);
+      for (int i = 0; i < _numExistingReplicaGroups; i++) {
+        List<String> replicaGroup = _existingInstancePartitions.getInstances(0, i);
+        if (j < replicaGroup.size()) {
+          existingMirroredServerList.add(replicaGroup.get(j));
+        }
+      }
+      _existingMirroredServerLists.add(existingMirroredServerList);
+    }
+  }
+}
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java
index a86c3e27a16..e3786b33d26 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java
@@ -527,31 +527,51 @@ private Pair getInstancePartitions(TableConfig tabl
     if (InstanceAssignmentConfigUtils.allowInstanceAssignment(tableConfig, instancePartitionsType)) {
       boolean hasPreConfiguredInstancePartitions =
           TableConfigUtils.hasPreConfiguredInstancePartitions(tableConfig, instancePartitionsType);
-      if (hasPreConfiguredInstancePartitions) {
+      boolean isPreConfigurationBasedAssignment =
+          InstanceAssignmentConfigUtils.isPreConfigurationBasedAssignment(tableConfig, instancePartitionsType);
+      InstanceAssignmentDriver instanceAssignmentDriver = new InstanceAssignmentDriver(tableConfig);
+      InstancePartitions instancePartitions;
+      boolean instancePartitionsUnchanged;
+      if (!hasPreConfiguredInstancePartitions) {
+        LOGGER.info("Reassigning {} instances for table: {}", instancePartitionsType, tableNameWithType);
+        // Assign instances with existing instance partition to null if bootstrap mode is enabled, so that the
+        // instance partition map can be fully recalculated.
+ instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, + _helixDataAccessor.getChildValues(_helixDataAccessor.keyBuilder().instanceConfigs(), true), + bootstrap ? null : existingInstancePartitions); + instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); + if (!dryRun && !instancePartitionsUnchanged) { + LOGGER.info("Persisting instance partitions: {} to ZK", instancePartitions); + InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), + instancePartitions); + } + } else if (isPreConfigurationBasedAssignment) { + String referenceInstancePartitionsName = tableConfig.getInstancePartitionsMap().get(instancePartitionsType); + InstancePartitions preConfiguredInstancePartitions = + InstancePartitionsUtils.fetchInstancePartitionsWithRename(_helixManager.getHelixPropertyStore(), + referenceInstancePartitionsName, instancePartitionsName); + instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, + _helixDataAccessor.getChildValues(_helixDataAccessor.keyBuilder().instanceConfigs(), true), + bootstrap ? 
null : existingInstancePartitions, preConfiguredInstancePartitions); + instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); + if (!dryRun && !instancePartitionsUnchanged) { + LOGGER.info("Persisting instance partitions: {} (based on {})", instancePartitions, + preConfiguredInstancePartitions); + InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), + instancePartitions); + } + } else { String referenceInstancePartitionsName = tableConfig.getInstancePartitionsMap().get(instancePartitionsType); - InstancePartitions instancePartitions = + instancePartitions = InstancePartitionsUtils.fetchInstancePartitionsWithRename(_helixManager.getHelixPropertyStore(), referenceInstancePartitionsName, instancePartitionsName); - boolean instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); + instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); if (!dryRun && !instancePartitionsUnchanged) { LOGGER.info("Persisting instance partitions: {} (referencing {})", instancePartitions, referenceInstancePartitionsName); InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), instancePartitions); } - return Pair.of(instancePartitions, instancePartitionsUnchanged); - } - LOGGER.info("Reassigning {} instances for table: {}", instancePartitionsType, tableNameWithType); - InstanceAssignmentDriver instanceAssignmentDriver = new InstanceAssignmentDriver(tableConfig); - // Assign instances with existing instance partition to null if bootstrap mode is enabled, so that the instance - // partition map can be fully recalculated. - InstancePartitions instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, - _helixDataAccessor.getChildValues(_helixDataAccessor.keyBuilder().instanceConfigs(), true), - bootstrap ? 
null : existingInstancePartitions); - boolean instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); - if (!dryRun && !instancePartitionsUnchanged) { - LOGGER.info("Persisting instance partitions: {} to ZK", instancePartitions); - InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), instancePartitions); } return Pair.of(instancePartitions, instancePartitionsUnchanged); } else { diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java index 4335d80b147..5b1007ce996 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java @@ -18,12 +18,20 @@ */ package org.apache.pinot.controller.helix.core.assignment.instance; +import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.LinkedList; import java.util.List; +import java.util.Random; +import java.util.Set; import org.apache.helix.model.InstanceConfig; +import org.apache.log4j.Level; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.pinot.common.assignment.InstanceAssignmentConfigUtils; import org.apache.pinot.common.assignment.InstancePartitions; import org.apache.pinot.common.utils.config.InstanceUtils; @@ -42,9 +50,7 @@ import org.apache.pinot.spi.utils.builder.TableConfigBuilder; import org.testng.annotations.Test; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.fail; +import static org.testng.Assert.*; public class 
InstanceAssignmentTest { @@ -54,6 +60,7 @@ public class InstanceAssignmentTest { private static final String SERVER_INSTANCE_ID_PREFIX = "Server_localhost_"; private static final String SERVER_INSTANCE_POOL_PREFIX = "_pool_"; private static final String TABLE_NAME_ZERO_HASH_COMPLEMENT = "12"; + public static final Logger LOGGER = LogManager.getLogger(InstanceAssignmentTest.class); @Test public void testDefaultOfflineReplicaGroup() { @@ -329,6 +336,719 @@ public void testDefaultOfflineReplicaGroup() { Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 0)); }
+  /**
+   * Runs the randomized pre-configuration based assignment check for 10,000,000 iterations.
+   *
+   * <p>NOTE(review): this method carries no {@code @Test} annotation, so it is presumably meant to be started by
+   * hand - confirm before wiring it into the regular test run.
+   *
+   * @throws FileNotFoundException if the log file {@code output.txt} cannot be created
+   */
+  public void testPreConfigurationBasedRandom()
+      throws FileNotFoundException {
+    testPreConfigurationBasedRandomInner(10000000);
+  }
+
+  /**
+   * Runs {@code loopCount} randomized assignment rounds and logs every round to {@code output.txt} in the working
+   * directory. {@code System.out} is redirected to that file while the rounds run and is restored afterwards, and
+   * the file stream is closed, even when a round fails.
+   *
+   * @param loopCount number of random rounds to run
+   * @throws FileNotFoundException if the log file {@code output.txt} cannot be created
+   */
+  public void testPreConfigurationBasedRandomInner(int loopCount)
+      throws FileNotFoundException {
+    PrintStream originalOut = System.out;
+    try (PrintStream fileOut = new PrintStream("output.txt")) {
+      System.setOut(fileOut);
+      try {
+        // Unseeded on purpose: every run explores different layouts
+        Random random = new Random();
+        for (int iter = 0; iter < loopCount; iter++) {
+          runRandomPreConfiguredAssignment(random, iter);
+        }
+      } finally {
+        // Restore stdout before the file stream gets closed
+        System.setOut(originalOut);
+      }
+    }
+  }
+
+  /**
+   * Runs one round: builds random pre-configured and existing instance partitions, lets the driver assign
+   * instances and checks the number of replica groups and partitions of the result. All inputs and the result are
+   * printed to {@code System.out}.
+   *
+   * @param random source of the random layout
+   * @param iter round number, only used for logging
+   */
+  private void runRandomPreConfiguredAssignment(Random random, int iter) {
+    System.out.printf("_____________________________ITERATION:%d________________________________%n", iter);
+    int numTargetReplicaGroups = random.nextInt(7) + 1;
+    int numExistingReplicaGroups = random.nextInt(7) + 1;
+    int numPreConfiguredInstancesPerReplicaGroup = random.nextInt(10) + 5;
+    // Both values are at least 5 and never exceed the number of pre-configured instances per replica group
+    int numTargetInstancesPerReplicaGroup = Math.max(random.nextInt(numPreConfiguredInstancesPerReplicaGroup), 5);
+    int numExistingInstancesPerReplicaGroup = Math.max(random.nextInt(numPreConfiguredInstancesPerReplicaGroup), 5);
+    int numPools = random.nextInt(10) + 1;
+
+    int numPartitions = 0;
+    int numInstancesPerPartition = 0;
+    List<InstanceConfig> instanceConfigs = new ArrayList<>();
+
+    int preConfiguredOffsetStart = random.nextInt(10);
+    for (int i = 0; i < 1000; i++) {
+      int pool = i % numPools;
+      InstanceConfig instanceConfig = new InstanceConfig(SERVER_INSTANCE_ID_PREFIX + i);
+      instanceConfig.addTag(OFFLINE_TAG);
+      instanceConfig.getRecord()
+          .setMapField(InstanceUtils.POOL_KEY, Collections.singletonMap(OFFLINE_TAG, Integer.toString(pool)));
+      instanceConfigs.add(instanceConfig);
+    }
+    InstanceTagPoolConfig tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null);
+    InstanceReplicaGroupPartitionConfig replicaPartitionConfig =
+        new InstanceReplicaGroupPartitionConfig(true, 0, numTargetReplicaGroups, numTargetInstancesPerReplicaGroup,
+            numPartitions, numInstancesPerPartition, false, null);
+
+    TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME)
+        .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(),
+            new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig,
+                InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString())))
+        .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured"))
+        .build();
+    InstanceAssignmentDriver driver = new InstanceAssignmentDriver(tableConfig);
+    InstancePartitions preConfigured = new InstancePartitions("preConfigured");
+    InstancePartitions existing = new InstancePartitions("existing");
+
+    List<String> preconfiguredInstances = new LinkedList<>();
+    List<String> existingInstances = new LinkedList<>();
+
+    Set<Integer> preConfiguredUsed = new HashSet<>();
+    Set<Integer> existingUsed = new HashSet<>();
+
+    // Draw distinct instance ids; Set.add returns false for an id that was drawn before
+    int preConfiguredIdBound = (int) (1.5 * numTargetReplicaGroups * numPreConfiguredInstancesPerReplicaGroup);
+    for (int i = 0; i < numTargetReplicaGroups; i++) {
+      for (int j = 0; j < numPreConfiguredInstancesPerReplicaGroup; j++) {
+        int instance;
+        do {
+          instance = random.nextInt(preConfiguredIdBound);
+        } while (!preConfiguredUsed.add(instance));
+        preconfiguredInstances.add(SERVER_INSTANCE_ID_PREFIX + (instance + preConfiguredOffsetStart));
+      }
+    }
+
+    int existingIdBound = (int) (1.5 * numExistingReplicaGroups * numExistingInstancesPerReplicaGroup);
+    for (int i = 0; i < numExistingReplicaGroups; i++) {
+      for (int j = 0; j < numExistingInstancesPerReplicaGroup; j++) {
+        int instance;
+        do {
+          instance = random.nextInt(existingIdBound);
+        } while (!existingUsed.add(instance));
+        existingInstances.add(SERVER_INSTANCE_ID_PREFIX + instance);
+      }
+    }
+
+    Collections.shuffle(preconfiguredInstances);
+    Collections.shuffle(existingInstances);
+
+    for (int i = 0; i < numTargetReplicaGroups; i++) {
+      preConfigured.setInstances(0, i, preconfiguredInstances.subList(i * numPreConfiguredInstancesPerReplicaGroup,
+          (i + 1) * numPreConfiguredInstancesPerReplicaGroup));
+    }
+
+    for (int i = 0; i < numExistingReplicaGroups; i++) {
+      existing.setInstances(0, i, existingInstances.subList(i * numExistingInstancesPerReplicaGroup,
+          (i + 1) * numExistingInstancesPerReplicaGroup));
+    }
+
+    System.out.println("Done initializing preconfigured and existing instances");
+    System.out.println("numTargetReplicaGroups " + numTargetReplicaGroups);
+    System.out.println("numPreConfiguredInstancesPerReplicaGroup " + numPreConfiguredInstancesPerReplicaGroup);
+    System.out.println("numTargetInstancesPerReplicaGroup " + numTargetInstancesPerReplicaGroup);
+
+    System.out.println("numExistingReplicaGroups " + numExistingReplicaGroups);
+    System.out.println("numExistingInstancesPerReplicaGroup " + numExistingInstancesPerReplicaGroup);
+    System.out.println();
+    for (int i = 0; i < numTargetReplicaGroups; i++) {
+      System.out.println("Preconfigured instances for replica group " + i + " : " + preConfigured.getInstances(0, i));
+    }
+    System.out.println();
+    for (int i = 0; i < numExistingReplicaGroups; i++) {
+      System.out.println("Existing instances for replica group " + i + " : " + existing.getInstances(0, i));
+    }
+    System.out.println();
+    InstancePartitions instancePartitions =
+        driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existing, preConfigured);
+    assertEquals(instancePartitions.getNumReplicaGroups(), numTargetReplicaGroups);
+    assertEquals(instancePartitions.getNumPartitions(), 1);
+
+    for (int i = 0; i < numTargetReplicaGroups; i++) {
+      System.out.println("Assigned instances for replica group " + i + " : " + instancePartitions.getInstances(0, i));
+    }
+  }
+
+ @Test + public void testPreConfigurationBased() { + LogManager.getLogger(PreConfiguredInstancePartitionSelector.class) + .setLevel(Level.INFO); + + // Test initial assignment 3 replica groups, 7 instances per rg. + int numPartitions = 0; + int numInstancesPerPartition = 0; + int numInstances = 21; + int numPools = 5; + int numReplicaGroups = 3; + int numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + List instanceConfigs = new ArrayList<>(numInstances); + for (int i = 0; i < 100; i++) { + int pool = i % numPools; + InstanceConfig instanceConfig = + new InstanceConfig(SERVER_INSTANCE_ID_PREFIX + i); + instanceConfig.addTag(OFFLINE_TAG); + instanceConfig.getRecord() + .setMapField(InstanceUtils.POOL_KEY, Collections.singletonMap(OFFLINE_TAG, Integer.toString(pool))); + instanceConfigs.add(instanceConfig); + } + InstanceTagPoolConfig tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + InstanceReplicaGroupPartitionConfig replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + InstanceAssignmentDriver 
driver = new InstanceAssignmentDriver(tableConfig); + InstancePartitions preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 15, + SERVER_INSTANCE_ID_PREFIX + 18)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 13, SERVER_INSTANCE_ID_PREFIX + 16, + SERVER_INSTANCE_ID_PREFIX + 19)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 14, SERVER_INSTANCE_ID_PREFIX + 17, + SERVER_INSTANCE_ID_PREFIX + 20)); + + InstancePartitions instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, null, preConfigured); + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + /* + * Pre-configured partitioning: + * RG1 RG2 RG3 + * Host 0 1 2 + * Host 3 4 5 + * Host 6 7 8 + * Host 9 10 11 + * Host 12 13 14 + * Host 15 16 17 + * Host 18 19 20 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 6 7 8 + * Host 12 13 14 + * Host 15 16 17 + * Host 9 10 11 + * Host 18 19 20 + * Host 3 4 5 + * Host 0 1 2 + */ + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 12, + SERVER_INSTANCE_ID_PREFIX + 15, + SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 18, + SERVER_INSTANCE_ID_PREFIX + 3, + SERVER_INSTANCE_ID_PREFIX + 0)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 7, + 
SERVER_INSTANCE_ID_PREFIX + 13, + SERVER_INSTANCE_ID_PREFIX + 16, + SERVER_INSTANCE_ID_PREFIX + 10, + SERVER_INSTANCE_ID_PREFIX + 19, + SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 1)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 14, + SERVER_INSTANCE_ID_PREFIX + 17, + SERVER_INSTANCE_ID_PREFIX + 11, + SERVER_INSTANCE_ID_PREFIX + 20, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 2)); + + // Test instance shuffling/uplifting from 3*5 to 3*7 + numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 21; + numReplicaGroups = 3; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 15, + SERVER_INSTANCE_ID_PREFIX + 18)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX 
+ 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 13, SERVER_INSTANCE_ID_PREFIX + 16, + SERVER_INSTANCE_ID_PREFIX + 19)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 14, SERVER_INSTANCE_ID_PREFIX + 17, + SERVER_INSTANCE_ID_PREFIX + 20)); + + InstancePartitions existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 9)); + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 7, SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 13, SERVER_INSTANCE_ID_PREFIX + 10)); + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 14, SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, SERVER_INSTANCE_ID_PREFIX + 11)); + + instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, + existingInstancePartitions, preConfigured); + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + /* + * uplift from 15 instances in 3 replicas to 21 instance in 3 replicas + * 21 instances in 4 pools + * Pre-configured partitioning: + * RG1 RG2 RG3 + * Host 0 1 2 + * Host 3 4 5 + * Host 6 7 8 + * Host 9 10 11 + * Host 12 13 14 + * Host 15 16 17 + * Host 18 19 20 + * + * Existing configured partitioning: + * RG1 RG2 RG3 + * Host 0 6 2 + * Host 12 7 14 + * Host 1 4 5 + * Host 3 13 8 + * Host 9 10 11 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 0 1 2 + * Host 12 13 14 + * Host 3 4 5 + * Host 6 7 8 + * Host 9 10 
11 + * Host 15 16 17 + * Host 18 19 20 + */ + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, + SERVER_INSTANCE_ID_PREFIX + 12, + SERVER_INSTANCE_ID_PREFIX + 3, + SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 15, + SERVER_INSTANCE_ID_PREFIX + 18)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 13, + SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, + SERVER_INSTANCE_ID_PREFIX + 16, + SERVER_INSTANCE_ID_PREFIX + 19)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 14, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, + SERVER_INSTANCE_ID_PREFIX + 17, + SERVER_INSTANCE_ID_PREFIX + 20)); + + // Test instance replacement from 3*6 to 3*5 + numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 15; + numReplicaGroups = 3; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + preConfigured = new 
InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, SERVER_INSTANCE_ID_PREFIX + 21, SERVER_INSTANCE_ID_PREFIX + 24, + SERVER_INSTANCE_ID_PREFIX + 27, SERVER_INSTANCE_ID_PREFIX + 30)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 19, SERVER_INSTANCE_ID_PREFIX + 22, SERVER_INSTANCE_ID_PREFIX + 25, + SERVER_INSTANCE_ID_PREFIX + 28, SERVER_INSTANCE_ID_PREFIX + 31)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 20, SERVER_INSTANCE_ID_PREFIX + 23, SERVER_INSTANCE_ID_PREFIX + 26, + SERVER_INSTANCE_ID_PREFIX + 29, SERVER_INSTANCE_ID_PREFIX + 32)); + + existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, SERVER_INSTANCE_ID_PREFIX + 15, SERVER_INSTANCE_ID_PREFIX + 18)); + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 16, SERVER_INSTANCE_ID_PREFIX + 19)); + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 17, SERVER_INSTANCE_ID_PREFIX + 20)); + + instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existingInstancePartitions, + preConfigured); + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + /* + * From 18 instances in 3 replicas to 15 instance in 3 replicas + * Pre-configured partitioning: + * RG1 RG2 RG3 + * Host 18 19 20 + * Host 21 22 23 + * Host 24 25 26 + * Host 27 28 29 + * Host 
30 31 32 + * + * Existing configured partitioning: + * RG1 RG2 RG3 + * Host 0 1 2 + * Host 3 4 5 + * Host 6 7 8 + * Host 9 10 11 + * Host 15 16 17 + * Host 18 19 20 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 18 19 20 + * Host 6 7 8 + * Host 3 4 5 + * Host 15 16 17 + * Host 9 10 11 + * + */ + + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, + SERVER_INSTANCE_ID_PREFIX + 30, + SERVER_INSTANCE_ID_PREFIX + 21, + SERVER_INSTANCE_ID_PREFIX + 24, + SERVER_INSTANCE_ID_PREFIX + 27)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 19, + SERVER_INSTANCE_ID_PREFIX + 31, + SERVER_INSTANCE_ID_PREFIX + 22, + SERVER_INSTANCE_ID_PREFIX + 25, + SERVER_INSTANCE_ID_PREFIX + 28)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 20, + SERVER_INSTANCE_ID_PREFIX + 32, + SERVER_INSTANCE_ID_PREFIX + 23, + SERVER_INSTANCE_ID_PREFIX + 26, + SERVER_INSTANCE_ID_PREFIX + 29)); + + // Test instance shuffling/uplifting from 3*5 to 3*7, with some instance replacement + numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 18; + numReplicaGroups = 3; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + 
.setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, SERVER_INSTANCE_ID_PREFIX + 15, SERVER_INSTANCE_ID_PREFIX + 18)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 16, SERVER_INSTANCE_ID_PREFIX + 19)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 17, SERVER_INSTANCE_ID_PREFIX + 20)); + + existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 9)); + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 7, SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 13, SERVER_INSTANCE_ID_PREFIX + 10)); + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 14, SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, SERVER_INSTANCE_ID_PREFIX + 11)); + + instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existingInstancePartitions, + preConfigured); + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + /* + * uplift from 15 
instances in 3 replicas to 21 instance in 3 replicas + * Pre-configured partitioning: + * RG1 RG2 RG3 + * Host 0 1 2 + * Host 3 4 5 + * Host 6 7 8 + * Host 9 10 11 + * Host 15 16 17 + * Host 18 19 20 + * + * Existing configured partitioning: + * RG1 RG2 RG3 + * Host 0 6 2 + * Host 12 7 14 + * Host 1 4 5 + * Host 3 13 8 + * Host 9 10 11 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 0 1 2 + * Host 6 7 8 + * Host 3 4 5 + * Host 15 16 17 + * Host 9 10 11 + * Host 18 19 20 + */ + + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, + SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 3, + SERVER_INSTANCE_ID_PREFIX + 15, + SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 18)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 16, + SERVER_INSTANCE_ID_PREFIX + 10, + SERVER_INSTANCE_ID_PREFIX + 19)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 17, + SERVER_INSTANCE_ID_PREFIX + 11, + SERVER_INSTANCE_ID_PREFIX + 20)); + + // Test instance shuffling/uplifting from 3*5 to 4*6 + numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 24; + numReplicaGroups = 4; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new 
InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 15)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 13, SERVER_INSTANCE_ID_PREFIX + 16)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 14, SERVER_INSTANCE_ID_PREFIX + 17)); + preConfigured.setInstances(0, 3, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, SERVER_INSTANCE_ID_PREFIX + 19, SERVER_INSTANCE_ID_PREFIX + 20, + SERVER_INSTANCE_ID_PREFIX + 21, SERVER_INSTANCE_ID_PREFIX + 22, SERVER_INSTANCE_ID_PREFIX + 23)); + + existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 9)); + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 7, SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 13, SERVER_INSTANCE_ID_PREFIX + 10)); + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, 
SERVER_INSTANCE_ID_PREFIX + 14, SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, SERVER_INSTANCE_ID_PREFIX + 11)); + + instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existingInstancePartitions, + preConfigured); + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + /* + * Test instance shuffling/uplifting from 3*5 to 4*6 + * Pre-configured partitioning: + * RG1 RG2 RG3 RG4 + * Host 0 1 2 18 + * Host 3 4 5 19 + * Host 6 7 8 20 + * Host 9 10 11 21 + * Host 12 13 14 22 + * Host 15 16 17 23 + * + * Existing configured partitioning: + * RG1 RG2 RG3 + * Host 0 6 2 + * Host 12 7 14 + * Host 1 4 5 + * Host 3 13 8 + * Host 9 10 11 + * + * Final assignment for this table: + * RG1 RG2 RG3 RG4 + * Host 0 1 2 18 + * Host 12 13 14 22 + * Host 3 4 5 19 + * Host 6 7 8 20 + * Host 9 10 11 21 + * Host 15 16 17 23 + */ + + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, + SERVER_INSTANCE_ID_PREFIX + 12, + SERVER_INSTANCE_ID_PREFIX + 3, + SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 15)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 13, + SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, + SERVER_INSTANCE_ID_PREFIX + 16)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 14, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, + SERVER_INSTANCE_ID_PREFIX + 17)); + assertEquals(instancePartitions.getInstances(0, 3), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, + SERVER_INSTANCE_ID_PREFIX + 22, + SERVER_INSTANCE_ID_PREFIX + 19, + SERVER_INSTANCE_ID_PREFIX + 20, + SERVER_INSTANCE_ID_PREFIX
+ 21, + SERVER_INSTANCE_ID_PREFIX + 23)); + + // Test instance shuffling/downlifting from 4 * 6 to 3 * 4 with shuffling of instances + numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 12; + numReplicaGroups = 3; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 14)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 22, SERVER_INSTANCE_ID_PREFIX + 13, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 17)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, SERVER_INSTANCE_ID_PREFIX + 19, SERVER_INSTANCE_ID_PREFIX + 20, + SERVER_INSTANCE_ID_PREFIX + 21, SERVER_INSTANCE_ID_PREFIX + 23)); + + existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, + SERVER_INSTANCE_ID_PREFIX + 12, + 
SERVER_INSTANCE_ID_PREFIX + 3, + SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 15)); + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 13, + SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, + SERVER_INSTANCE_ID_PREFIX + 16)); + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 14, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, + SERVER_INSTANCE_ID_PREFIX + 17)); + existingInstancePartitions.setInstances(0, 3, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, + SERVER_INSTANCE_ID_PREFIX + 22, + SERVER_INSTANCE_ID_PREFIX + 19, + SERVER_INSTANCE_ID_PREFIX + 20, + SERVER_INSTANCE_ID_PREFIX + 21, + SERVER_INSTANCE_ID_PREFIX + 23)); + + instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existingInstancePartitions, + preConfigured); + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + /* + * Test instance shuffling/downlifting from 4 * 6 to 3 * 4 with shuffling of instances + * Pre-configured partitioning: + * RG2 RG3 RG4 + * Host 1 2 18 + * Host 4 22 19 + * Host 7 13 20 + * Host 10 11 21 + * Host 14 17 23 + * + * Existing configured partitioning: + * RG1 RG2 RG3 RG4 + * Host 0 1 2 18 + * Host 3 4 5 19 + * Host 6 7 8 20 + * Host 9 10 11 21 + * Host 12 13 14 22 + * Host 15 16 17 23 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 1 2 18 + * Host 10 11 21 + * Host 7 13 20 + * Host 14 17 23 + */ + + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 10, + SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 14)); + 
assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 11, + SERVER_INSTANCE_ID_PREFIX + 13, + SERVER_INSTANCE_ID_PREFIX + 17)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, + SERVER_INSTANCE_ID_PREFIX + 21, + SERVER_INSTANCE_ID_PREFIX + 20, + SERVER_INSTANCE_ID_PREFIX + 23)); + } + @Test public void testPoolBased() { // 10 instances in 2 pools, each with 5 instances diff --git a/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java b/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java index e72d066bc98..170540bda00 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java @@ -85,6 +85,8 @@ public static class Cluster { public static final String UPDATE_USER = "UpdateUser"; public static final String UPDATE_ZNODE = "UpdateZnode"; public static final String UPLOAD_SEGMENT = "UploadSegment"; + public static final String GET_INSTANCE_PARTITION = "GetInstancePartition"; + public static final String UPDATE_INSTANCE_PARTITION = "UpdateInstancePartition"; } // Action names for table diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java index 186f545cea3..70c2e60323a 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java @@ -82,6 +82,7 @@ public InstanceReplicaGroupPartitionConfig getReplicaGroupPartitionConfig() { } public enum PartitionSelector { - FD_AWARE_INSTANCE_PARTITION_SELECTOR, INSTANCE_REPLICA_GROUP_PARTITION_SELECTOR + FD_AWARE_INSTANCE_PARTITION_SELECTOR, INSTANCE_REPLICA_GROUP_PARTITION_SELECTOR, + 
PRE_CONFIGURATION_BASED_PARTITION_SELECTOR } } From 1c66e5f9c0666ccb1ec0c0246e002ffc13eb360c Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 19 Oct 2023 16:13:30 -0700 Subject: [PATCH 02/19] checkstyle --- .../controller/api/resources/PinotTenantRestletResource.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java index 16faa3ea0d1..9cd11f63fcf 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java @@ -301,7 +301,8 @@ public String getTablesOnTenant( @ApiResponse(code = 500, message = "Instance partitions not found")}) public InstancePartitions getInstancePartitions( @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName, - @ApiParam(value = "Server type (OFFLINE|REALTIME)", required = true) @QueryParam("serverType") String serverType) { + @ApiParam(value = "Server type (OFFLINE|REALTIME)", required = true) @QueryParam("serverType") String serverType) + { String tenantNameWithType = InstancePartitionsType.valueOf(serverType).getInstancePartitionsName(tenantName); InstancePartitions instancePartitions = InstancePartitionsUtils.fetchInstancePartitions(_pinotHelixResourceManager.getPropertyStore(), From 19282cebbf77878febe24c6ed31eaf5059b7a842 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 19 Oct 2023 16:14:30 -0700 Subject: [PATCH 03/19] checkstyle --- .../controller/api/resources/PinotTenantRestletResource.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java 
b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java index 9cd11f63fcf..4f1320a4507 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java @@ -301,8 +301,8 @@ public String getTablesOnTenant( @ApiResponse(code = 500, message = "Instance partitions not found")}) public InstancePartitions getInstancePartitions( @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName, - @ApiParam(value = "Server type (OFFLINE|REALTIME)", required = true) @QueryParam("serverType") String serverType) - { + @ApiParam(value = "Server type (OFFLINE|REALTIME)", required = true) + @QueryParam("serverType") String serverType) { String tenantNameWithType = InstancePartitionsType.valueOf(serverType).getInstancePartitionsName(tenantName); InstancePartitions instancePartitions = InstancePartitionsUtils.fetchInstancePartitions(_pinotHelixResourceManager.getPropertyStore(), From f77e90bc4d66d2c9715a265438a068225abdd89c Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 19 Oct 2023 16:41:33 -0700 Subject: [PATCH 04/19] checkstyle --- .../api/resources/PinotTenantRestletResource.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java index 4f1320a4507..af17153a74b 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java @@ -301,9 +301,10 @@ public String getTablesOnTenant( @ApiResponse(code = 500, message = "Instance partitions not 
found")}) public InstancePartitions getInstancePartitions( @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName, - @ApiParam(value = "Server type (OFFLINE|REALTIME)", required = true) - @QueryParam("serverType") String serverType) { - String tenantNameWithType = InstancePartitionsType.valueOf(serverType).getInstancePartitionsName(tenantName); + @ApiParam(value = "instancePartitionType (OFFLINE|CONSUMING|COMPLETED)", required = true) + @QueryParam("instancePartitionType") String instancePartitionType) { + String tenantNameWithType = InstancePartitionsType.valueOf(instancePartitionType) + .getInstancePartitionsName(tenantName); InstancePartitions instancePartitions = InstancePartitionsUtils.fetchInstancePartitions(_pinotHelixResourceManager.getPropertyStore(), tenantNameWithType); @@ -327,7 +328,8 @@ public InstancePartitions getInstancePartitions( @ApiResponse(code = 500, message = "Failed to update the tenant")}) public InstancePartitions assignInstancesPartitionMap( @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName, - @ApiParam(value = "Server type (OFFLINE|REALTIME)", required = true) @QueryParam("serverType") String serverType, + @ApiParam(value = "instancePartitionType (OFFLINE|CONSUMING|COMPLETED)", required = true) + @QueryParam("instancePartitionType") String instancePartitionType, String instancePartitionsStr) { InstancePartitions instancePartitions; try { @@ -337,7 +339,8 @@ public InstancePartitions assignInstancesPartitionMap( Response.Status.BAD_REQUEST); } - String tenantNameWithType = InstancePartitionsType.valueOf(serverType).getInstancePartitionsName(tenantName); + String tenantNameWithType = InstancePartitionsType.valueOf(instancePartitionType) + .getInstancePartitionsName(tenantName); Preconditions.checkState(instancePartitions.getInstancePartitionsName().equals(tenantNameWithType), "Instance partitions name mismatch, expected: %s, got: %s", tenantNameWithType, 
instancePartitions.getInstancePartitionsName()); From 0df73b6c48bbe0246006393593e8565fee3aa1b2 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Mon, 23 Oct 2023 15:46:20 -0700 Subject: [PATCH 05/19] Trigger Test From eb66f9f17b8964d5fcdf34da236fe4c2e989f2e9 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 24 Oct 2023 15:25:36 -0700 Subject: [PATCH 06/19] Address comments --- .../InstanceAssignmentConfigUtils.java | 2 +- .../PinotInstanceAssignmentRestletResource.java | 4 ++++ .../resources/PinotTenantRestletResource.java | 4 ++-- .../InstancePartitionSelectorFactory.java | 4 ++-- ...irrorServerSetInstancePartitionSelector.java} | 6 +++--- .../instance/InstanceAssignmentTest.java | 16 ++++++++-------- .../java/org/apache/pinot/core/auth/Actions.java | 4 ++-- .../assignment/InstanceAssignmentConfig.java | 2 +- 8 files changed, 23 insertions(+), 19 deletions(-) rename pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/{PreConfiguredInstancePartitionSelector.java => MirrorServerSetInstancePartitionSelector.java} (98%) diff --git a/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java b/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java index cd056f2ca2e..d3aca92aa00 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java @@ -132,6 +132,6 @@ public static boolean isPreConfigurationBasedAssignment(TableConfig tableConfig, return tableConfig.getInstanceAssignmentConfigMap().get(instancePartitionsType.toString()) != null && InstanceAssignmentConfigUtils.getInstanceAssignmentConfig(tableConfig, instancePartitionsType) .getPartitionSelector() - == InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR; + == 
InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR; } } diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java index 38d3af7cbde..71c944188a6 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java @@ -252,10 +252,14 @@ private void assignInstancesForInstancePartitionsType(Map> _existingMirroredServerLists = new ArrayList<>(); - public PreConfiguredInstancePartitionSelector(InstanceReplicaGroupPartitionConfig replicaGroupPartitionConfig, + public MirrorServerSetInstancePartitionSelector(InstanceReplicaGroupPartitionConfig replicaGroupPartitionConfig, String tableNameWithType, InstancePartitions existingInstancePartitions, InstancePartitions preConfiguredInstancePartitions) { super(replicaGroupPartitionConfig, tableNameWithType, existingInstancePartitions); diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java index 5b1007ce996..e3e05316e78 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java @@ -374,7 +374,7 @@ public void testPreConfigurationBasedRandomInner(int loopCount) throws FileNotFo TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) 
.setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, - InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")) .build(); InstanceAssignmentDriver driver = new InstanceAssignmentDriver(tableConfig); @@ -452,7 +452,7 @@ public void testPreConfigurationBasedRandomInner(int loopCount) throws FileNotFo @Test public void testPreConfigurationBased() { - LogManager.getLogger(PreConfiguredInstancePartitionSelector.class) + LogManager.getLogger(MirrorServerSetInstancePartitionSelector.class) .setLevel(Level.INFO); // Test initial assignment 3 replica groups, 7 instances per rg. @@ -480,7 +480,7 @@ public void testPreConfigurationBased() { TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, - InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); InstanceAssignmentDriver driver = new InstanceAssignmentDriver(tableConfig); InstancePartitions preConfigured = new InstancePartitions("preConfigured"); @@ -561,7 +561,7 @@ public void testPreConfigurationBased() { tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), new 
InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, - InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); driver = new InstanceAssignmentDriver(tableConfig); preConfigured = new InstancePartitions("preConfigured"); @@ -664,7 +664,7 @@ public void testPreConfigurationBased() { tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, - InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); driver = new InstanceAssignmentDriver(tableConfig); preConfigured = new InstancePartitions("preConfigured"); @@ -756,7 +756,7 @@ public void testPreConfigurationBased() { tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, - InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); driver = new InstanceAssignmentDriver(tableConfig); preConfigured = new InstancePartitions("preConfigured"); @@ -851,7 +851,7 @@ public void 
testPreConfigurationBased() { tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, - InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); driver = new InstanceAssignmentDriver(tableConfig); preConfigured = new InstancePartitions("preConfigured"); @@ -956,7 +956,7 @@ public void testPreConfigurationBased() { tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, - InstanceAssignmentConfig.PartitionSelector.PRE_CONFIGURATION_BASED_PARTITION_SELECTOR.toString()))) + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); driver = new InstanceAssignmentDriver(tableConfig); preConfigured = new InstancePartitions("preConfigured"); diff --git a/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java b/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java index 170540bda00..7c9ea081d4a 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/auth/Actions.java @@ -85,8 +85,8 @@ public static class Cluster { public static final String UPDATE_USER = "UpdateUser"; public static final String UPDATE_ZNODE = "UpdateZnode"; public static final String UPLOAD_SEGMENT = "UploadSegment"; - public 
static final String GET_INSTANCE_PARTITION = "GetInstancePartition"; - public static final String UPDATE_INSTANCE_PARTITION = "UpdateInstancePartition"; + public static final String GET_INSTANCE_PARTITIONS = "GetInstancePartitions"; + public static final String UPDATE_INSTANCE_PARTITIONS = "UpdateInstancePartitions"; } // Action names for table diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java index 70c2e60323a..391ba4812d3 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/assignment/InstanceAssignmentConfig.java @@ -83,6 +83,6 @@ public InstanceReplicaGroupPartitionConfig getReplicaGroupPartitionConfig() { public enum PartitionSelector { FD_AWARE_INSTANCE_PARTITION_SELECTOR, INSTANCE_REPLICA_GROUP_PARTITION_SELECTOR, - PRE_CONFIGURATION_BASED_PARTITION_SELECTOR + MIRROR_SERVER_SET_PARTITION_SELECTOR } } From b4cccad25363bc563e9c805e7f7a62cc790c003c Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 24 Oct 2023 18:26:21 -0700 Subject: [PATCH 07/19] refactored branches --- ...inotInstanceAssignmentRestletResource.java | 44 +++++++++-------- .../helix/core/PinotHelixResourceManager.java | 30 ++++++------ .../helix/core/rebalance/TableRebalancer.java | 49 ++++++++++--------- 3 files changed, 62 insertions(+), 61 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java index 71c944188a6..e4ba5ba10b0 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java +++ 
b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java @@ -251,28 +251,30 @@ private void assignInstancesForInstancePartitionsType(Map getInstancePartitions(TableConfig tabl InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), instancePartitions); } - } else if (isPreConfigurationBasedAssignment) { - String referenceInstancePartitionsName = tableConfig.getInstancePartitionsMap().get(instancePartitionsType); - InstancePartitions preConfiguredInstancePartitions = - InstancePartitionsUtils.fetchInstancePartitionsWithRename(_helixManager.getHelixPropertyStore(), - referenceInstancePartitionsName, instancePartitionsName); - instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, - _helixDataAccessor.getChildValues(_helixDataAccessor.keyBuilder().instanceConfigs(), true), - bootstrap ? null : existingInstancePartitions, preConfiguredInstancePartitions); - instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); - if (!dryRun && !instancePartitionsUnchanged) { - LOGGER.info("Persisting instance partitions: {} (based on {})", instancePartitions, - preConfiguredInstancePartitions); - InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), - instancePartitions); - } } else { String referenceInstancePartitionsName = tableConfig.getInstancePartitionsMap().get(instancePartitionsType); - instancePartitions = - InstancePartitionsUtils.fetchInstancePartitionsWithRename(_helixManager.getHelixPropertyStore(), - referenceInstancePartitionsName, instancePartitionsName); - instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); - if (!dryRun && !instancePartitionsUnchanged) { - LOGGER.info("Persisting instance partitions: {} (referencing {})", instancePartitions, - referenceInstancePartitionsName); - 
InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), - instancePartitions); + if (isPreConfigurationBasedAssignment) { + InstancePartitions preConfiguredInstancePartitions = + InstancePartitionsUtils.fetchInstancePartitionsWithRename(_helixManager.getHelixPropertyStore(), + referenceInstancePartitionsName, instancePartitionsName); + instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, + _helixDataAccessor.getChildValues(_helixDataAccessor.keyBuilder().instanceConfigs(), true), + bootstrap ? null : existingInstancePartitions, preConfiguredInstancePartitions); + instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); + if (!dryRun && !instancePartitionsUnchanged) { + LOGGER.info("Persisting instance partitions: {} (based on {})", instancePartitions, + preConfiguredInstancePartitions); + InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), + instancePartitions); + } + } else { + instancePartitions = + InstancePartitionsUtils.fetchInstancePartitionsWithRename(_helixManager.getHelixPropertyStore(), + referenceInstancePartitionsName, instancePartitionsName); + instancePartitionsUnchanged = instancePartitions.equals(existingInstancePartitions); + if (!dryRun && !instancePartitionsUnchanged) { + LOGGER.info("Persisting instance partitions: {} (referencing {})", instancePartitions, + referenceInstancePartitionsName); + InstancePartitionsUtils.persistInstancePartitions(_helixManager.getHelixPropertyStore(), + instancePartitions); + } } } return Pair.of(instancePartitions, instancePartitionsUnchanged); From 37d7fe1357c77412796a0a107ea6246e38a5149c Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 24 Oct 2023 18:45:18 -0700 Subject: [PATCH 08/19] address comments --- ...rorServerSetInstancePartitionSelector.java | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git 
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java index 7e2009b552f..98ba77f55f9 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java @@ -31,6 +31,7 @@ import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; +import javax.annotation.Nullable; import org.apache.helix.model.InstanceConfig; import org.apache.pinot.common.assignment.InstancePartitions; import org.apache.pinot.spi.config.table.assignment.InstanceReplicaGroupPartitionConfig; @@ -38,6 +39,21 @@ import org.slf4j.LoggerFactory; +/** + * Detailed design see https://docs.google.com/document/d/1xxPkGPxyY21gAkFi9gtFDeSzEXjPjp-IQW70kHynsL8 + * During each creation/update/scale, the algorithm will refer to the corresponding tenant level instance partitions and + * generate an instance partition by taking numInstancePerReplicaGroup mirror server sets from the tenant level + * instance partitions. + * + * If an existingInstancePartition is provided, the algorithm will generate a best effort assignment that resembles + * the existingInstancePartition. + * + * Assumptions for this algorithm: + * 1. The number of replica groups in the tenant level instance partitions is the same as the number of replica groups + * in the table config. + * 2. The number of partitions at replica group level is 1 + * 3. 
This algorithm only works for replica group based table assignment + */ public class MirrorServerSetInstancePartitionSelector extends InstancePartitionSelector { private static final Logger LOGGER = LoggerFactory.getLogger(MirrorServerSetInstancePartitionSelector.class); private final InstancePartitions _preConfiguredInstancePartitions; @@ -62,7 +78,7 @@ public class MirrorServerSetInstancePartitionSelector extends InstancePartitionS private final List> _existingMirroredServerLists = new ArrayList<>(); public MirrorServerSetInstancePartitionSelector(InstanceReplicaGroupPartitionConfig replicaGroupPartitionConfig, - String tableNameWithType, InstancePartitions existingInstancePartitions, + String tableNameWithType, @Nullable InstancePartitions existingInstancePartitions, InstancePartitions preConfiguredInstancePartitions) { super(replicaGroupPartitionConfig, tableNameWithType, existingInstancePartitions); _preConfiguredInstancePartitions = preConfiguredInstancePartitions; @@ -172,7 +188,7 @@ void createLookupTablesFromPreConfiguredInstanceAssignmentMap() { } @Override - void selectInstances(Map> poolToInstanceConfigsMap, + public void selectInstances(Map> poolToInstanceConfigsMap, InstancePartitions instancePartitions) { if (_replicaGroupPartitionConfig.isReplicaGroupBased()) { validatePoolDiversePreconditions(poolToInstanceConfigsMap); From e8587c5f3217840fde313498f418cbe1f00f511f Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 24 Oct 2023 19:37:25 -0700 Subject: [PATCH 09/19] address comments --- .../resources/PinotTenantRestletResource.java | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java index 383b1d1a186..f856421b52b 100644 --- 
a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java @@ -21,7 +21,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.node.ObjectNode; -import com.google.common.base.Preconditions; import io.swagger.annotations.Api; import io.swagger.annotations.ApiKeyAuthDefinition; import io.swagger.annotations.ApiOperation; @@ -297,11 +296,12 @@ public String getTablesOnTenant( @Authenticate(AccessType.READ) @Produces(MediaType.APPLICATION_JSON) @ApiOperation(value = "Get the instance partitions of a tenant") - @ApiResponses(value = {@ApiResponse(code = 200, message = "Success"), - @ApiResponse(code = 500, message = "Instance partitions not found")}) + @ApiResponses(value = {@ApiResponse(code = 200, message = "Success", response = InstancePartitions.class), + @ApiResponse(code = 404, message = "Instance partitions not found")}) public InstancePartitions getInstancePartitions( @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName, - @ApiParam(value = "instancePartitionType (OFFLINE|CONSUMING|COMPLETED)", required = true) + @ApiParam(value = "instancePartitionType (OFFLINE|CONSUMING|COMPLETED)", required = true, + allowableValues = "OFFLINE, CONSUMING, COMPLETED") @QueryParam("instancePartitionType") String instancePartitionType) { String tenantNameWithType = InstancePartitionsType.valueOf(instancePartitionType) .getInstancePartitionsName(tenantName); @@ -324,11 +324,12 @@ public InstancePartitions getInstancePartitions( @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @ApiOperation(value = "Update an instance partition for a server type in a tenant") - @ApiResponses(value = {@ApiResponse(code = 200, message = "Success"), - @ApiResponse(code = 500, 
message = "Failed to update the tenant")}) + @ApiResponses(value = {@ApiResponse(code = 200, message = "Success", response = InstancePartitions.class), + @ApiResponse(code = 400, message = "Failed to update the tenant")}) public InstancePartitions assignInstancesPartitionMap( @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName, - @ApiParam(value = "instancePartitionType (OFFLINE|CONSUMING|COMPLETED)", required = true) + @ApiParam(value = "instancePartitionType (OFFLINE|CONSUMING|COMPLETED)", required = true, + allowableValues = "OFFLINE, CONSUMING, COMPLETED") @QueryParam("instancePartitionType") String instancePartitionType, String instancePartitionsStr) { InstancePartitions instancePartitions; @@ -339,11 +340,14 @@ public InstancePartitions assignInstancesPartitionMap( Response.Status.BAD_REQUEST); } - String tenantNameWithType = InstancePartitionsType.valueOf(instancePartitionType) + String tenantLevelInstancePartitionMap = InstancePartitionsType.valueOf(instancePartitionType) .getInstancePartitionsName(tenantName); - Preconditions.checkState(instancePartitions.getInstancePartitionsName().equals(tenantNameWithType), - "Instance partitions name mismatch, expected: %s, got: %s", tenantNameWithType, - instancePartitions.getInstancePartitionsName()); + + if (!instancePartitions.getInstancePartitionsName().equals(tenantLevelInstancePartitionMap)) { + throw new ControllerApplicationException(LOGGER, "Instance partitions name mismatch, expected: " + + tenantLevelInstancePartitionMap + + ", got: " + instancePartitions.getInstancePartitionsName(), Response.Status.BAD_REQUEST); + } persistInstancePartitionsHelper(instancePartitions); return instancePartitions; From 764c8fcad08968db6ab9e5d95ffa06b821ec2ff4 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 24 Oct 2023 19:57:48 -0700 Subject: [PATCH 10/19] address comments --- ...irrorServerSetInstancePartitionSelector.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 
deletion(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java index 98ba77f55f9..5c98c5cd25c 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java @@ -97,16 +97,21 @@ private void validatePoolDiversePreconditions(Map> // numTargetInstancesPerReplica should be positive Preconditions.checkState(_numTargetInstancesPerReplicaGroup > 0, "Number of instances per replica must be positive"); + LOGGER.info("Number of instances per replica: {}", _numTargetInstancesPerReplicaGroup); // _numTargetReplicaGroups should be positive Preconditions.checkState(_numTargetReplicaGroups > 0, "Number of replica-groups must be positive"); + LOGGER.info("Number of replica-groups: {}", _numTargetReplicaGroups); // validate target partition count is 1 Preconditions.checkState(_replicaGroupPartitionConfig.getNumPartitions() <= 1, "This algorithm does not support table level partitioning for target assignment"); + LOGGER.info("Number of partitions: {}", _replicaGroupPartitionConfig.getNumPartitions()); // Validate the existing instance partitions is null or has only one partition Preconditions.checkState( (_existingInstancePartitions == null || _existingInstancePartitions.getNumPartitions() == 1), "This algorithm does not support table level partitioning for existing assignment"); + LOGGER.info("Number of partitions in existing instance partitions: {}", _existingInstancePartitions == null ? 0 + : _existingInstancePartitions.getNumPartitions()); _numExistingReplicaGroups = _existingInstancePartitions == null ? 
0 : _existingInstancePartitions.getNumReplicaGroups(); @@ -118,6 +123,8 @@ private void validatePoolDiversePreconditions(Map> "Pre-configured instance partitions must be provided for pre-configuration based selection"); Preconditions.checkState(_preConfiguredInstancePartitions.getNumPartitions() == 1, "This algorithm does not support table level partitioning for pre-configured assignment"); + LOGGER.info("Number of partitions in pre-configured instance partitions: {}", _preConfiguredInstancePartitions + .getNumPartitions()); // Validate the number of replica-groups in the pre-configured instance partitions is equal to the target // number of replica-groups @@ -126,12 +133,16 @@ private void validatePoolDiversePreconditions(Map> "The number of replica-groups %s in the pre-configured instance partitions " + "is not equal to the target number of replica-groups %s", _numPreConfiguredReplicaGroups, _numTargetReplicaGroups); + LOGGER.info("Number of replica-groups in pre-configured instance partitions: {}", _numPreConfiguredReplicaGroups); + // Validate the number of instances per replica-group in the pre-configured instance partitions is greater than or // equal to the target number of instances per replica-group _numPreConfiguredInstancesPerReplicaGroup = _preConfiguredInstancePartitions.getInstances(0, 0).size(); Preconditions.checkState(_numPreConfiguredInstancesPerReplicaGroup >= _numTargetInstancesPerReplicaGroup, "The number of instances per replica-group in the pre-configured " + "instance partitions is less than the target number of instances per replica-group"); + LOGGER.info("Number of instances per replica-group in pre-configured instance partitions: {}", + _numPreConfiguredInstancesPerReplicaGroup); // Validate the pool to instance configs map is not null or empty Preconditions.checkNotNull(poolToInstanceConfigsMap, "poolToInstanceConfigsMap is null"); @@ -143,12 +154,16 @@ private void validatePoolDiversePreconditions(Map> HashSet availableInstanceSet = new 
HashSet<>(); poolToInstanceConfigsMap.values().forEach(list -> list.forEach(i -> availableInstanceSet.add(i.getInstanceName()))); + LOGGER.info("Number of pools: {}", numPools); + LOGGER.info("Number of instances in all pools: {}", availableInstanceSet.size()); + LOGGER.info("availableInstanceSet: {}", availableInstanceSet); for (int i = 0; i < _numPreConfiguredReplicaGroups; i++) { List instances = _preConfiguredInstancePartitions.getInstances(0, i); for (String instance : instances) { Preconditions.checkState(availableInstanceSet.contains(instance), - "Instance %s in pre-configured instance " + "partitions is not in the pool to instance configs map", + "Instance %s in pre-configured instance partitions is not in " + + "the pool to instance configs map", instance); } } From 9b5c4a9ade0ce907e1e2ef73b7821563519d1764 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Wed, 25 Oct 2023 00:13:14 -0700 Subject: [PATCH 11/19] Change the shuffling logic to be compatible with future changes --- ...rorServerSetInstancePartitionSelector.java | 30 ++++++++-- .../instance/InstanceAssignmentTest.java | 56 +++++++++---------- 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java index 5c98c5cd25c..998fcea041e 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java @@ -22,6 +22,7 @@ import java.util.AbstractMap; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ 
-217,16 +218,28 @@ public void selectInstances(Map> poolToInstanceCon createListFromPreConfiguredInstanceAssignmentMap(); // shuffle the list of lists of mirrored servers based on the table name hash int tableNameHash = Math.abs(_tableNameWithType.hashCode()); - Collections.shuffle(_preConfiguredMirroredServerLists, new Random(tableNameHash)); - - // create the instance partitions based on the rotated list of mirrored servers + // initialize a list of indices from 0 to _numPreConfiguredInstancesPerReplicaGroup + List shuffledIndex = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup); + for (int i = 0; i < _numPreConfiguredInstancesPerReplicaGroup; i++) { + shuffledIndex.add(i); + } + // shuffle the list of indices based on the table name hash + Collections.shuffle(shuffledIndex, new Random(tableNameHash)); + // select the first _numTargetInstancesPerReplicaGroup indices + shuffledIndex = shuffledIndex.subList(0, _numTargetInstancesPerReplicaGroup); + // sort the list of indices so that they follow the original order of the pre-configured instance partitions + shuffledIndex.sort(Comparator.naturalOrder()); + + // create the instance partitions based on the shuffled list of mirrored servers List> resultReplicaGroups = new ArrayList<>(_numTargetReplicaGroups); for (int i = 0; i < _numTargetReplicaGroups; i++) { resultReplicaGroups.add(new ArrayList<>(_numTargetInstancesPerReplicaGroup)); } + + // populate the instance partitions with the selected mirrored servers for (int j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { for (int i = 0; i < _numTargetReplicaGroups; i++) { - resultReplicaGroups.get(i).add(_preConfiguredMirroredServerLists.get(j).get(i)); + resultReplicaGroups.get(i).add(_preConfiguredMirroredServerLists.get(shuffledIndex.get(j)).get(i)); } } for (int i = 0; i < _numTargetReplicaGroups; i++) { @@ -246,7 +259,7 @@ public void selectInstances(Map> poolToInstanceCon Map> existingOffsetToResultTuple = new HashMap<>(); // For each instance 
offset, find the mirrored server that is most similar to the existing mirrored server - // set. If the mirrored server is not used, add it to the result list. + // set. If this mirrored server is not used, add it to the result list. for (int j = 0; j < _numExistingInstancesPerReplicaGroup; j++) { List existingMirroredServers = _existingMirroredServerLists.get(j); int finalJ = j; @@ -284,7 +297,12 @@ public void selectInstances(Map> poolToInstanceCon for (int j = 0; j < _numPreConfiguredInstancesPerReplicaGroup; j++) { shuffledOffsets.add(j); } - Collections.shuffle(shuffledOffsets, new Random(Math.abs(_tableNameWithType.hashCode()))); + // Commenting this out as + // (1) Shuffling is already done in the initial step. + // (2) We want to keep the order of the pre-configured instance partitions, so that the segment assignment + // strategy for single tenant cluster can be minimized-impact. + // But keeping the code here in case we want to have a specific reordering strategy in the future. 
+ // Collections.shuffle(shuffledOffsets, new Random(Math.abs(_tableNameWithType.hashCode()))); for (int k = 0, j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { if (existingOffsetToResultTuple.containsKey(j)) { continue; diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java index e3e05316e78..93a882883f6 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java @@ -514,38 +514,38 @@ public void testPreConfigurationBased() { * * Final assignment for this table: * RG1 RG2 RG3 + * Host 0 1 2 + * Host 3 4 5 * Host 6 7 8 + * Host 9 10 11 * Host 12 13 14 * Host 15 16 17 - * Host 9 10 11 * Host 18 19 20 - * Host 3 4 5 - * Host 0 1 2 */ assertEquals(instancePartitions.getInstances(0, 0), - Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 6, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 0, + SERVER_INSTANCE_ID_PREFIX + 3, + SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 15, - SERVER_INSTANCE_ID_PREFIX + 9, - SERVER_INSTANCE_ID_PREFIX + 18, - SERVER_INSTANCE_ID_PREFIX + 3, - SERVER_INSTANCE_ID_PREFIX + 0)); + SERVER_INSTANCE_ID_PREFIX + 18)); assertEquals(instancePartitions.getInstances(0, 1), - Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 7, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 13, SERVER_INSTANCE_ID_PREFIX + 16, - SERVER_INSTANCE_ID_PREFIX + 10, - SERVER_INSTANCE_ID_PREFIX + 19, - SERVER_INSTANCE_ID_PREFIX + 4, - SERVER_INSTANCE_ID_PREFIX + 1)); + SERVER_INSTANCE_ID_PREFIX + 19)); 
assertEquals(instancePartitions.getInstances(0, 2), - Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 8, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 14, SERVER_INSTANCE_ID_PREFIX + 17, - SERVER_INSTANCE_ID_PREFIX + 11, - SERVER_INSTANCE_ID_PREFIX + 20, - SERVER_INSTANCE_ID_PREFIX + 5, - SERVER_INSTANCE_ID_PREFIX + 2)); + SERVER_INSTANCE_ID_PREFIX + 20)); // Test instance shuffling/uplifting from 3*5 to 3*7 numPartitions = 0; @@ -716,31 +716,31 @@ public void testPreConfigurationBased() { * Final assignment for this table: * RG1 RG2 RG3 * Host 18 19 20 - * Host 6 7 8 - * Host 3 4 5 - * Host 15 16 17 - * Host 9 10 11 + * Host 21 22 23 + * Host 24 25 26 + * Host 27 28 29 + * Host 30 31 32 * */ assertEquals(instancePartitions.getInstances(0, 0), Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 18, - SERVER_INSTANCE_ID_PREFIX + 30, SERVER_INSTANCE_ID_PREFIX + 21, SERVER_INSTANCE_ID_PREFIX + 24, - SERVER_INSTANCE_ID_PREFIX + 27)); + SERVER_INSTANCE_ID_PREFIX + 27, + SERVER_INSTANCE_ID_PREFIX + 30)); assertEquals(instancePartitions.getInstances(0, 1), Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 19, - SERVER_INSTANCE_ID_PREFIX + 31, SERVER_INSTANCE_ID_PREFIX + 22, SERVER_INSTANCE_ID_PREFIX + 25, - SERVER_INSTANCE_ID_PREFIX + 28)); + SERVER_INSTANCE_ID_PREFIX + 28, + SERVER_INSTANCE_ID_PREFIX + 31)); assertEquals(instancePartitions.getInstances(0, 2), Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 20, - SERVER_INSTANCE_ID_PREFIX + 32, SERVER_INSTANCE_ID_PREFIX + 23, SERVER_INSTANCE_ID_PREFIX + 26, - SERVER_INSTANCE_ID_PREFIX + 29)); + SERVER_INSTANCE_ID_PREFIX + 29, + SERVER_INSTANCE_ID_PREFIX + 32)); // Test instance shuffling/uplifting from 3*5 to 3*7, with some instance replacement numPartitions = 0; From ad941aa6f1b9931020c585338170d90920a0c0a3 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Wed, 25 Oct 2023 01:42:09 -0700 Subject: [PATCH 12/19] add test cases --- 
.../instance/InstanceAssignmentTest.java | 238 ++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java index 93a882883f6..84e92a86354 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java @@ -1047,6 +1047,244 @@ public void testPreConfigurationBased() { SERVER_INSTANCE_ID_PREFIX + 21, SERVER_INSTANCE_ID_PREFIX + 20, SERVER_INSTANCE_ID_PREFIX + 23)); + + + // upscale 3*3 to 3*5 + numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 15; + numReplicaGroups = 3; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7, + 
SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 13)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 14)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 15)); + + existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 3)); + + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 6)); + + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 9)); + + instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existingInstancePartitions, + preConfigured); + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + /* + * Test upscaling from 3 * 3 to 3 * 5 + * Pre-configured partitioning: + * RG2 RG3 RG4 + * Host 1 2 3 + * Host 4 5 6 + * Host 7 8 9 + * Host 10 11 12 + * Host 13 14 15 + * + * Existing configured partitioning: + * RG1 RG2 RG3 + * Host 1 2 3 + * Host 4 5 6 + * Host 7 8 9 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 1 2 3 + * Host 4 5 6 + * Host 7 8 9 + * Host 10 11 12 + * Host 13 14 15 + */ + + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, + SERVER_INSTANCE_ID_PREFIX + 4, + SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 
10, + SERVER_INSTANCE_ID_PREFIX + 13)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, + SERVER_INSTANCE_ID_PREFIX + 5, + SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, + SERVER_INSTANCE_ID_PREFIX + 14)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, + SERVER_INSTANCE_ID_PREFIX + 6, + SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 12, + SERVER_INSTANCE_ID_PREFIX + 15)); + + // downscale 3*5 to 3*3 + numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 9; + numReplicaGroups = 3; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + + preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 9)); + + 
existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7, + SERVER_INSTANCE_ID_PREFIX + 10, SERVER_INSTANCE_ID_PREFIX + 13)); + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8, + SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 14)); + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 9, + SERVER_INSTANCE_ID_PREFIX + 12, SERVER_INSTANCE_ID_PREFIX + 15)); + + instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existingInstancePartitions, + preConfigured); + + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + + /* + * Test downscaling from 3 * 5 to 3 * 3 + * Pre-configured partitioning: + * RG2 RG3 RG4 + * Host 1 2 3 + * Host 4 5 6 + * Host 7 8 9 + * + * Existing configured partitioning: + * RG1 RG2 RG3 + * Host 1 2 3 + * Host 4 5 6 + * Host 7 8 9 + * Host 10 11 12 + * Host 13 14 15 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 1 2 3 + * Host 4 5 6 + * Host 7 8 9 + */ + + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 9)); + + // replace instance 5 with instance 11 + 
numPartitions = 0; + numInstancesPerPartition = 0; + numInstances = 9; + numReplicaGroups = 3; + numInstancesPerReplicaGroup = numInstances / numReplicaGroups; + tagPoolConfig = new InstanceTagPoolConfig(OFFLINE_TAG, true, numPools, null); + replicaPartitionConfig = + new InstanceReplicaGroupPartitionConfig(true, 0, numReplicaGroups, numInstancesPerReplicaGroup, numPartitions, + numInstancesPerPartition, false, null); + + tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName(RAW_TABLE_NAME) + .setInstanceAssignmentConfigMap(Collections.singletonMap(InstancePartitionsType.OFFLINE.toString(), + new InstanceAssignmentConfig(tagPoolConfig, null, replicaPartitionConfig, + InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR.toString()))) + .setInstancePartitionsMap(Collections.singletonMap(InstancePartitionsType.OFFLINE, "preConfigured")).build(); + driver = new InstanceAssignmentDriver(tableConfig); + + preConfigured = new InstancePartitions("preConfigured"); + preConfigured.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7)); + preConfigured.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 8)); + preConfigured.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 9)); + + existingInstancePartitions = new InstancePartitions("existing"); + existingInstancePartitions.setInstances(0, 0, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7)); + existingInstancePartitions.setInstances(0, 1, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 5, SERVER_INSTANCE_ID_PREFIX + 8)); + existingInstancePartitions.setInstances(0, 2, + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 9)); + 
+ instancePartitions = + driver.assignInstances(InstancePartitionsType.OFFLINE, instanceConfigs, existingInstancePartitions, + preConfigured); + + assertEquals(instancePartitions.getNumReplicaGroups(), numReplicaGroups); + assertEquals(instancePartitions.getNumPartitions(), 1); + + /* + * Test replacing instance 5 with instance 11 in a 3 * 3 assignment + * Pre-configured partitioning: + * RG2 RG3 RG4 + * Host 1 2 3 + * Host 4 11 6 + * Host 7 8 9 + * + * Existing configured partitioning: + * RG2 RG3 RG4 + * Host 1 2 3 + * Host 4 5 6 + * Host 7 8 9 + * + * Final assignment for this table: + * RG1 RG2 RG3 + * Host 1 2 3 + * Host 4 11 6 + * Host 7 8 9 + */ + + // Verifying the final configuration after the instance replacement + assertEquals(instancePartitions.getInstances(0, 0), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 1, SERVER_INSTANCE_ID_PREFIX + 4, SERVER_INSTANCE_ID_PREFIX + 7)); + assertEquals(instancePartitions.getInstances(0, 1), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 2, SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 8)); + assertEquals(instancePartitions.getInstances(0, 2), + Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 3, SERVER_INSTANCE_ID_PREFIX + 6, SERVER_INSTANCE_ID_PREFIX + 9)); } @Test From db493491b170222c845d2990b724aeea41f36bd6 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Wed, 25 Oct 2023 09:48:04 -0700 Subject: [PATCH 13/19] address comments --- .../api/resources/PinotTenantRestletResource.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java index f856421b52b..58fa81cf47c 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java +++ 
b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java @@ -310,7 +310,8 @@ public InstancePartitions getInstancePartitions( tenantNameWithType); if (instancePartitions == null) { - throw new ControllerApplicationException(LOGGER, "Failed to find the instance partitions", + throw new ControllerApplicationException(LOGGER, + String.format("Failed to find the instance partitions for %s", tenantNameWithType), Response.Status.NOT_FOUND); } else { return instancePartitions; @@ -340,12 +341,12 @@ public InstancePartitions assignInstancesPartitionMap( Response.Status.BAD_REQUEST); } - String tenantLevelInstancePartitionMap = InstancePartitionsType.valueOf(instancePartitionType) + String inputTenantName = InstancePartitionsType.valueOf(instancePartitionType) .getInstancePartitionsName(tenantName); - if (!instancePartitions.getInstancePartitionsName().equals(tenantLevelInstancePartitionMap)) { + if (!instancePartitions.getInstancePartitionsName().equals(inputTenantName)) { throw new ControllerApplicationException(LOGGER, "Instance partitions name mismatch, expected: " - + tenantLevelInstancePartitionMap + + inputTenantName + ", got: " + instancePartitions.getInstancePartitionsName(), Response.Status.BAD_REQUEST); } From dab32991a983a221827c84aee5fa9d13ac3739dc Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Wed, 25 Oct 2023 20:07:38 -0700 Subject: [PATCH 14/19] address comments --- .../InstanceAssignmentConfigUtils.java | 4 +- ...inotInstanceAssignmentRestletResource.java | 2 +- .../helix/core/PinotHelixResourceManager.java | 2 +- ...rorServerSetInstancePartitionSelector.java | 303 +++++++++--------- .../helix/core/rebalance/TableRebalancer.java | 2 +- 5 files changed, 164 insertions(+), 149 deletions(-) diff --git a/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java b/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java index 
d3aca92aa00..30a3a19f20a 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/assignment/InstanceAssignmentConfigUtils.java @@ -125,10 +125,10 @@ public static InstanceAssignmentConfig getInstanceAssignmentConfig(TableConfig t return new InstanceAssignmentConfig(tagPoolConfig, null, replicaGroupPartitionConfig); } - public static boolean isPreConfigurationBasedAssignment(TableConfig tableConfig, + public static boolean isMirrorServerSetAssignment(TableConfig tableConfig, InstancePartitionsType instancePartitionsType) { // If the instance assignment config is not null and the partition selector is - // PRE_CONFIGURATION_BASED_PARTITION_SELECTOR, + // MIRROR_SERVER_SET_PARTITION_SELECTOR, return tableConfig.getInstanceAssignmentConfigMap().get(instancePartitionsType.toString()) != null && InstanceAssignmentConfigUtils.getInstanceAssignmentConfig(tableConfig, instancePartitionsType) .getPartitionSelector() diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java index e4ba5ba10b0..282431e04bf 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotInstanceAssignmentRestletResource.java @@ -252,7 +252,7 @@ private void assignInstancesForInstancePartitionsType(Map> _preConfiguredMirroredServerLists = new ArrayList<>(); + private final Map _preConfiguredInstanceNameToOffsetMap = new HashMap<>(); + private final List> _existingMirroredServerLists = new ArrayList<>(); // dimensions of pre-configured instance partition private int _numPreConfiguredReplicaGroups; private int _numPreConfiguredInstancesPerReplicaGroup; - 
// dimensions of existing instance partition private int _numExistingReplicaGroups; private int _numExistingInstancesPerReplicaGroup; - // look up tables for pre-configured instance partition - private final List> _preConfiguredMirroredServerLists = new ArrayList<>(); - private final Map _preConfiguredInstanceNameToOffsetMap = new HashMap<>(); - - private final List> _existingMirroredServerLists = new ArrayList<>(); - public MirrorServerSetInstancePartitionSelector(InstanceReplicaGroupPartitionConfig replicaGroupPartitionConfig, String tableNameWithType, @Nullable InstancePartitions existingInstancePartitions, InstancePartitions preConfiguredInstancePartitions) { @@ -96,23 +92,25 @@ private void validatePoolDiversePreconditions(Map> LOGGER.info("Validating pre-configured instance partitions for pre-configuration based replica-group selection"); // numTargetInstancesPerReplica should be positive + LOGGER.info("Number of instances per replica: {}", _numTargetInstancesPerReplicaGroup); Preconditions.checkState(_numTargetInstancesPerReplicaGroup > 0, "Number of instances per replica must be positive"); - LOGGER.info("Number of instances per replica: {}", _numTargetInstancesPerReplicaGroup); + // _numTargetReplicaGroups should be positive - Preconditions.checkState(_numTargetReplicaGroups > 0, "Number of replica-groups must be positive"); LOGGER.info("Number of replica-groups: {}", _numTargetReplicaGroups); + Preconditions.checkState(_numTargetReplicaGroups > 0, "Number of replica-groups must be positive"); + // validate target partition count is 1 + LOGGER.info("Number of partitions: {}", _replicaGroupPartitionConfig.getNumPartitions()); Preconditions.checkState(_replicaGroupPartitionConfig.getNumPartitions() <= 1, "This algorithm does not support table level partitioning for target assignment"); - LOGGER.info("Number of partitions: {}", _replicaGroupPartitionConfig.getNumPartitions()); // Validate the existing instance partitions is null or has only one partition + 
LOGGER.info("Number of partitions in existing instance partitions: {}", + _existingInstancePartitions == null ? 0 : _existingInstancePartitions.getNumPartitions()); Preconditions.checkState( (_existingInstancePartitions == null || _existingInstancePartitions.getNumPartitions() == 1), - "This algorithm does not support table level partitioning for existing assignment"); - LOGGER.info("Number of partitions in existing instance partitions: {}", _existingInstancePartitions == null ? 0 - : _existingInstancePartitions.getNumPartitions()); + "This algorithm does not support replica group level partitioning for existing assignment"); _numExistingReplicaGroups = _existingInstancePartitions == null ? 0 : _existingInstancePartitions.getNumReplicaGroups(); @@ -122,35 +120,41 @@ private void validatePoolDiversePreconditions(Map> // Validate the pre-configured instance partitions is not null and has only one partition Preconditions.checkState(_preConfiguredInstancePartitions != null, "Pre-configured instance partitions must be provided for pre-configuration based selection"); + LOGGER.info("Number of partitions in pre-configured instance partitions: {}", + _preConfiguredInstancePartitions.getNumPartitions()); Preconditions.checkState(_preConfiguredInstancePartitions.getNumPartitions() == 1, "This algorithm does not support table level partitioning for pre-configured assignment"); - LOGGER.info("Number of partitions in pre-configured instance partitions: {}", _preConfiguredInstancePartitions - .getNumPartitions()); // Validate the number of replica-groups in the pre-configured instance partitions is equal to the target // number of replica-groups _numPreConfiguredReplicaGroups = _preConfiguredInstancePartitions.getNumReplicaGroups(); + LOGGER.info("Number of replica-groups in pre-configured instance partitions: {}", _numPreConfiguredReplicaGroups); Preconditions.checkState(_numPreConfiguredReplicaGroups == _numTargetReplicaGroups, "The number of replica-groups %s in the 
pre-configured instance partitions " + "is not equal to the target number of replica-groups %s", _numPreConfiguredReplicaGroups, _numTargetReplicaGroups); - LOGGER.info("Number of replica-groups in pre-configured instance partitions: {}", _numPreConfiguredReplicaGroups); // Validate the number of instances per replica-group in the pre-configured instance partitions is greater than or // equal to the target number of instances per replica-group _numPreConfiguredInstancesPerReplicaGroup = _preConfiguredInstancePartitions.getInstances(0, 0).size(); + LOGGER.info("Number of instances per replica-group in pre-configured instance partitions: {}, target number of " + + "instances per replica-group: {}", _numPreConfiguredInstancesPerReplicaGroup, + _numTargetInstancesPerReplicaGroup); Preconditions.checkState(_numPreConfiguredInstancesPerReplicaGroup >= _numTargetInstancesPerReplicaGroup, "The number of instances per replica-group in the pre-configured " - + "instance partitions is less than the target number of instances per replica-group"); - LOGGER.info("Number of instances per replica-group in pre-configured instance partitions: {}", - _numPreConfiguredInstancesPerReplicaGroup); + + "instance partitions is less than the target number of instances per replica-group %s", + _numTargetInstancesPerReplicaGroup); // Validate the pool to instance configs map is not null or empty Preconditions.checkNotNull(poolToInstanceConfigsMap, "poolToInstanceConfigsMap is null"); int numPools = poolToInstanceConfigsMap.size(); Preconditions.checkState(numPools > 0, "No pool qualified for selection"); - Preconditions.checkState(poolToInstanceConfigsMap.values().stream().map(List::size).reduce(Integer::sum) - .orElse(0) >= _numTargetTotalInstances, + Integer totalInstanceCount = poolToInstanceConfigsMap.values().stream().map(List::size) + .reduce(Integer::sum).orElse(0); + LOGGER.info("Total number of instances in all pools: {}, target number of instances: {}", totalInstanceCount, + 
_numTargetTotalInstances); + Preconditions.checkState(totalInstanceCount + >= _numTargetTotalInstances, "The total number of instances in all pools is less than the target number of target instances"); HashSet availableInstanceSet = new HashSet<>(); @@ -174,7 +178,7 @@ private void validatePoolDiversePreconditions(Map> _numTargetTotalInstances, _numTargetReplicaGroups, _numTargetInstancesPerReplicaGroup); } - void createListFromPreConfiguredInstanceAssignmentMap() { + private void createMirrorServerListFromPreconfiguredInstancePartition() { List> preConfiguredReplicaGroups = new ArrayList<>(_numPreConfiguredReplicaGroups); for (int i = 0; i < _numPreConfiguredReplicaGroups; i++) { preConfiguredReplicaGroups.add(_preConfiguredInstancePartitions.getInstances(0, i)); @@ -189,7 +193,7 @@ void createListFromPreConfiguredInstanceAssignmentMap() { } } - void createLookupTablesFromPreConfiguredInstanceAssignmentMap() { + private void createMirrorServerListLookupTablesFromPreconfiguredInstancePartition() { List> preConfiguredReplicaGroups = new ArrayList<>(_numPreConfiguredReplicaGroups); for (int i = 0; i < _numPreConfiguredReplicaGroups; i++) { preConfiguredReplicaGroups.add(_preConfiguredInstancePartitions.getInstances(0, i)); @@ -206,133 +210,144 @@ void createLookupTablesFromPreConfiguredInstanceAssignmentMap() { @Override public void selectInstances(Map> poolToInstanceConfigsMap, InstancePartitions instancePartitions) { - if (_replicaGroupPartitionConfig.isReplicaGroupBased()) { - validatePoolDiversePreconditions(poolToInstanceConfigsMap); - if (_existingInstancePartitions == null) { - // If no existing instance partitions, create new instance partitions based on the pre-configured instance - // partitions. This is done by just selecting _targetNumInstancesPerReplicaGroup set of mirrored servers - // from the pre-configured instance partitions. - LOGGER.info("No existing instance partitions found. 
Will build new on top of" - + " the pre-configured instance partitions"); - // create a list of lists of mirrored servers from the pre-configured instance partitions - createListFromPreConfiguredInstanceAssignmentMap(); - // shuffle the list of lists of mirrored servers based on the table name hash - int tableNameHash = Math.abs(_tableNameWithType.hashCode()); - // initialize a list of indices from 0 to _numPreConfiguredInstancesPerReplicaGroup - List shuffledIndex = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup); - for (int i = 0; i < _numPreConfiguredInstancesPerReplicaGroup; i++) { - shuffledIndex.add(i); - } - // shuffle the list of indices based on the table name hash - Collections.shuffle(shuffledIndex, new Random(tableNameHash)); - // select the first _numTargetInstancesPerReplicaGroup indices - shuffledIndex = shuffledIndex.subList(0, _numTargetInstancesPerReplicaGroup); - // sort the list of indices so that they follow the original order of the pre-configured instance partitions - shuffledIndex.sort(Comparator.naturalOrder()); - - // create the instance partitions based on the shuffled list of mirrored servers - List> resultReplicaGroups = new ArrayList<>(_numTargetReplicaGroups); - for (int i = 0; i < _numTargetReplicaGroups; i++) { - resultReplicaGroups.add(new ArrayList<>(_numTargetInstancesPerReplicaGroup)); - } + // throw exception instantly if not replica-group based + if (!_replicaGroupPartitionConfig.isReplicaGroupBased()) { + throw new IllegalStateException("Does not support Non-replica-group based selection"); + } - // populate the instance partitions with the selected mirrored servers - for (int j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { - for (int i = 0; i < _numTargetReplicaGroups; i++) { - resultReplicaGroups.get(i).add(_preConfiguredMirroredServerLists.get(shuffledIndex.get(j)).get(i)); - } - } - for (int i = 0; i < _numTargetReplicaGroups; i++) { - instancePartitions.setInstances(0, i, resultReplicaGroups.get(i)); - } 
- } else { - // If existing instance partitions exist, adjust the existing instance partitions based on the pre-configured - // instance partitions. This code path takes care of instance replacement, uplift, and downlift. - // This is done by search in the pre-configured instance partitions for the mirrored - // servers sets that are similar to the existing sets in instance partitions. - LOGGER.info("Existing instance partitions found. Will adjust the existing instance partitions" - + " based on the pre-configured instance partitions"); - createListFromPreConfiguredInstanceAssignmentMap(); - createLookupTablesFromPreConfiguredInstanceAssignmentMap(); - createListAndLookupTablesFromExistingInstancePartitions(); - Set usedPreconfiguredInstanceOffsets = new HashSet<>(); - Map> existingOffsetToResultTuple = new HashMap<>(); - - // For each instance offset, find the mirrored server that is most similar to the existing mirrored server - // set. If this mirrored server is not used, add it to the result list. - for (int j = 0; j < _numExistingInstancesPerReplicaGroup; j++) { - List existingMirroredServers = _existingMirroredServerLists.get(j); - int finalJ = j; - existingMirroredServers.stream() - .map(_preConfiguredInstanceNameToOffsetMap::get) - .filter(Objects::nonNull) - .filter(offset -> !usedPreconfiguredInstanceOffsets.contains(offset)) - .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())) - .entrySet().stream().max(Map.Entry.comparingByValue()).ifPresent(e -> { - existingOffsetToResultTuple.put(finalJ, e); - usedPreconfiguredInstanceOffsets.add(e.getKey()); - }); - } + validatePoolDiversePreconditions(poolToInstanceConfigsMap); + if (_existingInstancePartitions == null) { + // If no existing instance partitions, create new instance partitions based on the pre-configured instance + // partitions. This is done by just selecting _targetNumInstancesPerReplicaGroup set of mirrored servers + // from the pre-configured instance partitions. 
+ initialAssignment(instancePartitions); + } else { + // If existing instance partitions exist, adjust the existing instance partitions based on the pre-configured + // instance partitions. This code path takes care of instance replacement, uplift, and downlift. + // This is done by search in the pre-configured instance partitions for the mirrored + // servers sets that are similar to the existing sets in instance partitions. + scale(instancePartitions); + } + } - if (_numExistingInstancesPerReplicaGroup > _numTargetInstancesPerReplicaGroup) { - // If this is a downlift case - List> collect = existingOffsetToResultTuple.values() - .stream() - .sorted((a, b) -> b.getValue().compareTo(a.getValue())) - .limit(_numTargetInstancesPerReplicaGroup) - .collect(Collectors.toList()); - int size = collect.size(); - existingOffsetToResultTuple.clear(); - usedPreconfiguredInstanceOffsets.clear(); - for (int j = 0; j < size; j++) { - existingOffsetToResultTuple.put(j, collect.get(j)); - usedPreconfiguredInstanceOffsets.add(collect.get(j).getKey()); - } - } + private void initialAssignment(InstancePartitions instancePartitions) { + LOGGER.info("No existing instance partitions found. 
Will build new on top of" + + " the pre-configured instance partitions"); + // create a list of lists of mirrored servers from the pre-configured instance partitions + createMirrorServerListFromPreconfiguredInstancePartition(); + // shuffle the list of lists of mirrored servers based on the table name hash + int tableNameHash = Math.abs(_tableNameWithType.hashCode()); + // initialize a list of indices from 0 to _numPreConfiguredInstancesPerReplicaGroup + List shuffledIndex = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup); + for (int i = 0; i < _numPreConfiguredInstancesPerReplicaGroup; i++) { + shuffledIndex.add(i); + } + // shuffle the list of indices based on the table name hash + Collections.shuffle(shuffledIndex, new Random(tableNameHash)); + // select the first _numTargetInstancesPerReplicaGroup indices + shuffledIndex = shuffledIndex.subList(0, _numTargetInstancesPerReplicaGroup); + // sort the list of indices so that they follow the original order of the pre-configured instance partitions + shuffledIndex.sort(Comparator.naturalOrder()); + + // create the instance partitions based on the shuffled list of mirrored servers + List> resultReplicaGroups = new ArrayList<>(_numTargetReplicaGroups); + for (int i = 0; i < _numTargetReplicaGroups; i++) { + resultReplicaGroups.add(new ArrayList<>(_numTargetInstancesPerReplicaGroup)); + } - if (existingOffsetToResultTuple.size() < _numTargetInstancesPerReplicaGroup) { - // If the number of instances selected from the result list is less than the target number - // of instances per replica group, add the remaining instances from the pre-configured instance partitions. - ArrayList shuffledOffsets = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup); - for (int j = 0; j < _numPreConfiguredInstancesPerReplicaGroup; j++) { - shuffledOffsets.add(j); - } - // Commenting this out as - // (1) Shuffling is already done in the initial step. 
- // (2) We want to keep the order of the pre-configured instance partitions, so that the segment assignment - // strategy for single tenant cluster can be minimized-impact. - // But keeping the code here in case we want to have a specific reordering strategy in the future. - // Collections.shuffle(shuffledOffsets, new Random(Math.abs(_tableNameWithType.hashCode()))); - for (int k = 0, j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { - if (existingOffsetToResultTuple.containsKey(j)) { - continue; - } - while (usedPreconfiguredInstanceOffsets.contains(shuffledOffsets.get(k))) { - k++; - } - Integer offset = shuffledOffsets.get(k); - existingOffsetToResultTuple.put(j, new AbstractMap.SimpleEntry<>(offset, 0L)); - usedPreconfiguredInstanceOffsets.add(offset); - } - } + // populate the instance partitions with the selected mirrored servers + for (int j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { + for (int i = 0; i < _numTargetReplicaGroups; i++) { + resultReplicaGroups.get(i).add(_preConfiguredMirroredServerLists.get(shuffledIndex.get(j)).get(i)); + } + } + for (int i = 0; i < _numTargetReplicaGroups; i++) { + instancePartitions.setInstances(0, i, resultReplicaGroups.get(i)); + } + } - List> resultReplicaGroups = new ArrayList<>(_numTargetReplicaGroups); - for (int i = 0; i < _numTargetReplicaGroups; i++) { - resultReplicaGroups.add(new ArrayList<>(_numTargetInstancesPerReplicaGroup)); - } - for (int j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { - List mirrorServers = - _preConfiguredMirroredServerLists.get(existingOffsetToResultTuple.get(j).getKey()); - for (int i = 0; i < _numTargetReplicaGroups; i++) { - resultReplicaGroups.get(i).add(mirrorServers.get(i)); - } + private void scale(InstancePartitions instancePartitions) { + LOGGER.info("Existing instance partitions found. 
Will adjust the existing instance partitions" + + " based on the pre-configured instance partitions"); + createMirrorServerListFromPreconfiguredInstancePartition(); + createMirrorServerListLookupTablesFromPreconfiguredInstancePartition(); + createListAndLookupTablesFromExistingInstancePartitions(); + Set usedPreconfiguredInstanceOffsets = new HashSet<>(); + Map> existingOffsetToResultTuple = new HashMap<>(); + + // For each instance offset, find the mirrored server that is most similar to the existing mirrored server + // set. If this mirrored server is not used, add it to the result list. + for (int j = 0; j < _numExistingInstancesPerReplicaGroup; j++) { + List existingMirroredServers = _existingMirroredServerLists.get(j); + int finalJ = j; + existingMirroredServers.stream() + .map(_preConfiguredInstanceNameToOffsetMap::get) + .filter(Objects::nonNull) + .filter(offset -> !usedPreconfiguredInstanceOffsets.contains(offset)) + .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())) + .entrySet() + .stream() + .max(Map.Entry.comparingByValue()) + .ifPresent(e -> { + existingOffsetToResultTuple.put(finalJ, e); + usedPreconfiguredInstanceOffsets.add(e.getKey()); + }); + } + + if (_numExistingInstancesPerReplicaGroup > _numTargetInstancesPerReplicaGroup) { + // If this is a downlift case + List> collect = existingOffsetToResultTuple.values() + .stream() + .sorted((a, b) -> b.getValue().compareTo(a.getValue())) + .limit(_numTargetInstancesPerReplicaGroup) + .collect(Collectors.toList()); + int size = collect.size(); + existingOffsetToResultTuple.clear(); + usedPreconfiguredInstanceOffsets.clear(); + for (int j = 0; j < size; j++) { + existingOffsetToResultTuple.put(j, collect.get(j)); + usedPreconfiguredInstanceOffsets.add(collect.get(j).getKey()); + } + } + + if (existingOffsetToResultTuple.size() < _numTargetInstancesPerReplicaGroup) { + // If the number of instances selected from the result list is less than the target number + // of instances per 
replica group, add the remaining instances from the pre-configured instance partitions. + ArrayList shuffledOffsets = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup); + for (int j = 0; j < _numPreConfiguredInstancesPerReplicaGroup; j++) { + shuffledOffsets.add(j); + } + // Commenting this out as + // (1) Shuffling is already done in the initial step. + // (2) We want to keep the order of the pre-configured instance partitions, so that the segment assignment + // strategy for single tenant cluster can be minimized-impact. + // But keeping the code here in case we want to have a specific reordering strategy in the future. + // Collections.shuffle(shuffledOffsets, new Random(Math.abs(_tableNameWithType.hashCode()))); + for (int k = 0, j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { + if (existingOffsetToResultTuple.containsKey(j)) { + continue; } - for (int i = 0; i < _numTargetReplicaGroups; i++) { - instancePartitions.setInstances(0, i, resultReplicaGroups.get(i)); + while (usedPreconfiguredInstanceOffsets.contains(shuffledOffsets.get(k))) { + k++; } + Integer offset = shuffledOffsets.get(k); + existingOffsetToResultTuple.put(j, new AbstractMap.SimpleEntry<>(offset, 0L)); + usedPreconfiguredInstanceOffsets.add(offset); } - } else { - throw new IllegalStateException("Does not support Non-replica-group based selection"); + } + + List> resultReplicaGroups = new ArrayList<>(_numTargetReplicaGroups); + for (int i = 0; i < _numTargetReplicaGroups; i++) { + resultReplicaGroups.add(new ArrayList<>(_numTargetInstancesPerReplicaGroup)); + } + for (int j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { + List mirrorServers = _preConfiguredMirroredServerLists.get(existingOffsetToResultTuple.get(j).getKey()); + for (int i = 0; i < _numTargetReplicaGroups; i++) { + resultReplicaGroups.get(i).add(mirrorServers.get(i)); + } + } + for (int i = 0; i < _numTargetReplicaGroups; i++) { + instancePartitions.setInstances(0, i, resultReplicaGroups.get(i)); } } diff 
--git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java index 28c61fac6f4..efd04d3bd61 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java @@ -528,7 +528,7 @@ private Pair getInstancePartitions(TableConfig tabl boolean hasPreConfiguredInstancePartitions = TableConfigUtils.hasPreConfiguredInstancePartitions(tableConfig, instancePartitionsType); boolean isPreConfigurationBasedAssignment = - InstanceAssignmentConfigUtils.isPreConfigurationBasedAssignment(tableConfig, instancePartitionsType); + InstanceAssignmentConfigUtils.isMirrorServerSetAssignment(tableConfig, instancePartitionsType); InstanceAssignmentDriver instanceAssignmentDriver = new InstanceAssignmentDriver(tableConfig); InstancePartitions instancePartitions; boolean instancePartitionsUnchanged; From c6465677dfe727cf02f2202433010dcdb56b8d92 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Wed, 25 Oct 2023 20:14:52 -0700 Subject: [PATCH 15/19] address comments --- .../controller/api/resources/PinotTenantRestletResource.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java index 58fa81cf47c..8166427a938 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/api/resources/PinotTenantRestletResource.java @@ -326,7 +326,8 @@ public InstancePartitions getInstancePartitions( @Produces(MediaType.APPLICATION_JSON) @ApiOperation(value = "Update an 
instance partition for a server type in a tenant") @ApiResponses(value = {@ApiResponse(code = 200, message = "Success", response = InstancePartitions.class), - @ApiResponse(code = 400, message = "Failed to update the tenant")}) + @ApiResponse(code = 400, message = "Failed to deserialize/validate the instance partitions"), + @ApiResponse(code = 500, message = "Error updating the tenant")}) public InstancePartitions assignInstancesPartitionMap( @ApiParam(value = "Tenant name ", required = true) @PathParam("tenantName") String tenantName, @ApiParam(value = "instancePartitionType (OFFLINE|CONSUMING|COMPLETED)", required = true, From 367af71c66bcecec96370d904950afd0986dc42b Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 26 Oct 2023 15:39:15 -0700 Subject: [PATCH 16/19] change table validation logic --- .../segment/local/utils/TableConfigUtils.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java index 71e47c7c62c..aad0a992e74 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java @@ -766,11 +766,25 @@ static void validateInstancePartitionsTypeMapConfig(TableConfig tableConfig) { tableConfig.getInstanceAssignmentConfigMap())) { return; } - for (InstancePartitionsType instancePartitionsType : tableConfig.getInstancePartitionsMap().keySet()) { - Preconditions.checkState( - !tableConfig.getInstanceAssignmentConfigMap().containsKey(instancePartitionsType.toString()), - String.format("Both InstanceAssignmentConfigMap and InstancePartitionsMap set for %s", - instancePartitionsType)); + + for (InstancePartitionsType instancePartitionsType : InstancePartitionsType.values()) { + if 
(tableConfig.getInstanceAssignmentConfigMap().containsKey(instancePartitionsType.toString())) { + InstanceAssignmentConfig instanceAssignmentConfig = + tableConfig.getInstanceAssignmentConfigMap().get(instancePartitionsType.toString()); + if (instanceAssignmentConfig.getPartitionSelector() + == InstanceAssignmentConfig.PartitionSelector.MIRROR_SERVER_SET_PARTITION_SELECTOR) { + Preconditions.checkState( + tableConfig.getInstancePartitionsMap().containsKey(instancePartitionsType), + String.format("Both InstanceAssignmentConfigMap and InstancePartitionsMap needed for %s, as " + + "MIRROR_SERVER_SET_PARTITION_SELECTOR is used", + instancePartitionsType)); + } else { + Preconditions.checkState( + !tableConfig.getInstancePartitionsMap().containsKey(instancePartitionsType), + String.format("Both InstanceAssignmentConfigMap and InstancePartitionsMap set for %s", + instancePartitionsType)); + } + } } } From b044af09b6faa1f1a9ecbc31df2517fb107f2ed5 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 26 Oct 2023 16:34:15 -0700 Subject: [PATCH 17/19] Trigger Test From 545e7d05a57d1212355a5ba40b6742d752879225 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 26 Oct 2023 23:16:29 -0700 Subject: [PATCH 18/19] update shuffling logic --- ...rorServerSetInstancePartitionSelector.java | 20 +++++++++---------- .../instance/InstanceAssignmentTest.java | 8 ++++---- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java index 08ad4d9de23..6b4086615a5 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java +++ 
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/assignment/instance/MirrorServerSetInstancePartitionSelector.java @@ -313,24 +313,22 @@ private void scale(InstancePartitions instancePartitions) { if (existingOffsetToResultTuple.size() < _numTargetInstancesPerReplicaGroup) { // If the number of instances selected from the result list is less than the target number // of instances per replica group, add the remaining instances from the pre-configured instance partitions. - ArrayList shuffledOffsets = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup); + List shuffledOffsets = new ArrayList<>(_numPreConfiguredInstancesPerReplicaGroup); for (int j = 0; j < _numPreConfiguredInstancesPerReplicaGroup; j++) { shuffledOffsets.add(j); } - // Commenting this out as - // (1) Shuffling is already done in the initial step. - // (2) We want to keep the order of the pre-configured instance partitions, so that the segment assignment - // strategy for single tenant cluster can be minimized-impact. - // But keeping the code here in case we want to have a specific reordering strategy in the future. 
- // Collections.shuffle(shuffledOffsets, new Random(Math.abs(_tableNameWithType.hashCode()))); + for (Map.Entry> entry : existingOffsetToResultTuple.entrySet()) { + shuffledOffsets.remove(entry.getValue().getKey()); + } + Collections.shuffle(shuffledOffsets, new Random(Math.abs(_tableNameWithType.hashCode()))); + shuffledOffsets = + shuffledOffsets.subList(0, _numTargetInstancesPerReplicaGroup - existingOffsetToResultTuple.size()); + shuffledOffsets.sort(Comparator.naturalOrder()); for (int k = 0, j = 0; j < _numTargetInstancesPerReplicaGroup; j++) { if (existingOffsetToResultTuple.containsKey(j)) { continue; } - while (usedPreconfiguredInstanceOffsets.contains(shuffledOffsets.get(k))) { - k++; - } - Integer offset = shuffledOffsets.get(k); + Integer offset = shuffledOffsets.get(k++); existingOffsetToResultTuple.put(j, new AbstractMap.SimpleEntry<>(offset, 0L)); usedPreconfiguredInstanceOffsets.add(offset); } diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java index 84e92a86354..b25a529e101 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/assignment/instance/InstanceAssignmentTest.java @@ -336,11 +336,11 @@ public void testDefaultOfflineReplicaGroup() { Arrays.asList(SERVER_INSTANCE_ID_PREFIX + 11, SERVER_INSTANCE_ID_PREFIX + 0)); } - public void testPreConfigurationBasedRandom() throws FileNotFoundException { - testPreConfigurationBasedRandomInner(10000000); + public void testMirrorServerSetBasedRandom() throws FileNotFoundException { + testMirrorServerSetBasedRandomInner(10000000); } - public void testPreConfigurationBasedRandomInner(int loopCount) throws FileNotFoundException { + public void 
testMirrorServerSetBasedRandomInner(int loopCount) throws FileNotFoundException { PrintStream o = new PrintStream("output.txt"); System.setOut(o); for (int iter = 0; iter < loopCount; iter++) { @@ -451,7 +451,7 @@ public void testPreConfigurationBasedRandomInner(int loopCount) throws FileNotFo } @Test - public void testPreConfigurationBased() { + public void testMirrorServerSetBased() { LogManager.getLogger(MirrorServerSetInstancePartitionSelector.class) .setLevel(Level.INFO); From 8479b70bccaab26bd70f49d39aaadf9e040acda9 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Fri, 27 Oct 2023 15:05:56 -0700 Subject: [PATCH 19/19] add more logging --- .../pinot/controller/helix/core/PinotHelixResourceManager.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java index 37d2e29f802..d3cf819bf76 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java @@ -1749,7 +1749,8 @@ private void assignInstances(TableConfig tableConfig, boolean override) { referenceInstancePartitionsName, instancePartitionsType.getInstancePartitionsName(rawTableName)); instancePartitions = instanceAssignmentDriver.assignInstances(instancePartitionsType, instanceConfigs, null, preConfiguredInstancePartitions); - LOGGER.info("Persisting instance partitions: {}", instancePartitions); + LOGGER.info("Persisting instance partitions: {} (based on {})", instancePartitions, + preConfiguredInstancePartitions); } else { instancePartitions = InstancePartitionsUtils.fetchInstancePartitionsWithRename(_propertyStore, referenceInstancePartitionsName, instancePartitionsType.getInstancePartitionsName(rawTableName));