From 3d149074e143ec685a3d079e9acf33bd9e0e6b40 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 25 Apr 2017 16:37:08 -0700 Subject: [PATCH 01/41] Moved patch to (new) branch. Updated to master --- .../org/apache/solr/cloud/AddReplicaCmd.java | 9 +- .../java/org/apache/solr/cloud/Assign.java | 11 +- .../apache/solr/cloud/CloudDescriptor.java | 17 +- .../solr/cloud/CreateCollectionCmd.java | 38 +- .../apache/solr/cloud/ElectionContext.java | 13 +- .../org/apache/solr/cloud/MigrateCmd.java | 8 +- .../org/apache/solr/cloud/MoveReplicaCmd.java | 4 +- .../OverseerCollectionMessageHandler.java | 89 +- .../apache/solr/cloud/RecoveryStrategy.java | 182 +++- .../solr/cloud/ReplicateFromLeader.java | 37 +- .../org/apache/solr/cloud/SplitShardCmd.java | 2 +- .../org/apache/solr/cloud/ZkController.java | 50 +- .../solr/cloud/overseer/ReplicaMutator.java | 7 +- .../solr/cloud/overseer/SliceMutator.java | 26 +- .../solr/cloud/overseer/ZkStateWriter.java | 1 + .../solr/cloud/rule/ReplicaAssigner.java | 6 +- .../org/apache/solr/core/CoreContainer.java | 37 +- .../org/apache/solr/handler/IndexFetcher.java | 17 +- .../handler/admin/CollectionsHandler.java | 15 +- .../solr/handler/admin/CoreAdminHandler.java | 3 + .../solr/handler/admin/PrepRecoveryOp.java | 3 +- .../component/RealTimeGetComponent.java | 5 + .../solr/update/DefaultSolrCoreState.java | 10 +- .../solr/update/DirectUpdateHandler2.java | 20 +- .../org/apache/solr/update/UpdateCommand.java | 1 + .../org/apache/solr/update/UpdateHandler.java | 6 +- .../org/apache/solr/update/UpdateLog.java | 5 +- .../processor/DistributedUpdateProcessor.java | 38 +- solr/core/src/test-files/log4j.properties | 12 +- .../cloud-minimal/conf/solrconfig.xml | 2 +- .../solr/cloud/BasicDistributedZk2Test.java | 4 +- .../solr/cloud/BasicDistributedZkTest.java | 4 +- .../cloud/ChaosMonkeyNothingIsSafeTest.java | 4 +- .../CollectionsAPIDistributedZkTest.java | 4 +- .../solr/cloud/CollectionsAPISolrJTest.java | 2 +- .../apache/solr/cloud/ForceLeaderTest.java | 4 +- .../apache/solr/cloud/HttpPartitionTest.java | 4 +- .../LeaderInitiatedRecoveryOnCommitTest.java | 4 +- .../solr/cloud/OnlyLeaderIndexesTest.java | 63 +- ...rseerCollectionConfigSetProcessorTest.java | 3 +- .../org/apache/solr/cloud/OverseerTest.java | 4 +- .../cloud/RecoveryAfterSoftCommitTest.java | 4 +- .../org/apache/solr/cloud/ShardSplitTest.java | 11 +- .../apache/solr/cloud/TestAppendReplica.java | 808 ++++++++++++++++++ .../apache/solr/cloud/TestCloudRecovery.java | 3 +- .../apache/solr/cloud/TestCollectionAPI.java | 3 +- .../apache/solr/cloud/TestPassiveReplica.java | 549 ++++++++++++ .../hdfs/HdfsBasicDistributedZkTest.java | 4 +- .../solr/SolrCloudReportersTest.java | 2 +- .../update/TestInPlaceUpdatesDistrib.java | 4 +- .../client/solrj/impl/CloudSolrClient.java | 3 +- .../solrj/request/CollectionAdminRequest.java | 100 ++- .../solr/common/cloud/DocCollection.java | 23 +- .../org/apache/solr/common/cloud/Replica.java | 32 + .../org/apache/solr/common/cloud/Slice.java | 21 +- .../solr/common/cloud/ZkStateReader.java | 19 +- .../solr/common/params/CoreAdminParams.java | 5 + ...lectionAdminRequestRequiredParamsTest.java | 9 + .../java/org/apache/solr/SolrTestCaseJ4.java | 2 +- .../solr/cloud/AbstractDistribZkTestBase.java | 3 + .../cloud/AbstractFullDistribZkTestBase.java | 52 +- .../org/apache/solr/cloud/ChaosMonkey.java | 4 +- 62 files changed, 2143 insertions(+), 292 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java create mode 100644 solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java diff --git a/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java index 6bb33508edbd..4420a9209fca 100644 --- a/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java @@ -72,6 +72,7 @@ ZkNodeProps addReplica(ClusterState clusterState, ZkNodeProps message, NamedList String node = message.getStr(CoreAdminParams.NODE); String shard = message.getStr(SHARD_ID_PROP); String coreName = message.getStr(CoreAdminParams.NAME); + Replica.Type replicaType = Replica.Type.valueOf(message.getStr(ZkStateReader.REPLICA_TYPE, Replica.Type.REALTIME.name())); boolean parallel = message.getBool("parallel", false); if (StringUtils.isBlank(coreName)) { coreName = message.getStr(CoreAdminParams.PROPERTY_PREFIX + CoreAdminParams.NAME); @@ -93,7 +94,7 @@ ZkNodeProps addReplica(ClusterState clusterState, ZkNodeProps message, NamedList // Kind of unnecessary, but it does put the logic of whether to override maxShardsPerNode in one place. if (!skipCreateReplicaInClusterState) { node = getNodesForNewReplicas(clusterState, collection, shard, 1, node, - ocmh.overseer.getZkController().getCoreContainer()).get(0).nodeName; + ocmh.overseer.getZkController().getCoreContainer()).get(0).nodeName;// TODO: use replica type in this logic too } log.info("Node Identified {} for creating new replica", node); @@ -101,7 +102,7 @@ ZkNodeProps addReplica(ClusterState clusterState, ZkNodeProps message, NamedList throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Node: " + node + " is not live"); } if (coreName == null) { - coreName = Assign.buildCoreName(coll, shard); + coreName = Assign.buildCoreName(coll, shard, replicaType); } else if (!skipCreateReplicaInClusterState) { //Validate that the core name is unique in that collection for (Slice slice : coll.getSlices()) { @@ -126,7 +127,8 @@ ZkNodeProps addReplica(ClusterState clusterState, ZkNodeProps message, NamedList ZkStateReader.CORE_NAME_PROP, coreName, ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(), ZkStateReader.BASE_URL_PROP, zkStateReader.getBaseUrlForNodeName(node), - ZkStateReader.NODE_NAME_PROP, node); + ZkStateReader.NODE_NAME_PROP, node, + ZkStateReader.REPLICA_TYPE, replicaType.name()); Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props)); } params.set(CoreAdminParams.CORE_NODE_NAME, @@ -142,6 +144,7 @@ ZkNodeProps addReplica(ClusterState clusterState, ZkNodeProps message, NamedList params.set(CoreAdminParams.NAME, coreName); params.set(COLL_CONF, configName); params.set(CoreAdminParams.COLLECTION, collection); + params.set(CoreAdminParams.REPLICA_TYPE, replicaType.name()); if (shard != null) { params.set(CoreAdminParams.SHARD, shard); } else if (routeKey != null) { diff --git a/solr/core/src/java/org/apache/solr/cloud/Assign.java b/solr/core/src/java/org/apache/solr/cloud/Assign.java index ba03ccd2c26f..ca784e554d4d 100644 --- a/solr/core/src/java/org/apache/solr/cloud/Assign.java +++ b/solr/core/src/java/org/apache/solr/cloud/Assign.java @@ -107,12 +107,16 @@ public static String assignShard(DocCollection collection, Integer numShards) { returnShardId = shardIdNames.get(0); return returnShardId; } + + public static String buildCoreName(String collectionName, String shard, Replica.Type type, int replicaNum) { + return collectionName + "_" + shard + "_replica_" + type.name().substring(0,1).toLowerCase() + replicaNum; + } - static String buildCoreName(DocCollection collection, String shard) { + public static String buildCoreName(DocCollection collection, String shard, Replica.Type type) { Slice slice = collection.getSlice(shard); int replicaNum = slice.getReplicas().size(); for (; ; ) { - String replicaName = collection.getName() + "_" + shard + "_replica" + replicaNum; + String replicaName = buildCoreName(collection.getName(), shard, type, replicaNum); boolean exists = false; for (Replica replica : slice.getReplicas()) { if (replicaName.equals(replica.getStr(CORE_NAME_PROP))) { @@ -121,9 +125,8 @@ static String buildCoreName(DocCollection collection, String shard) { } } if (exists) replicaNum++; - else break; + else return replicaName; } - return collection.getName() + "_" + shard + "_replica" + replicaNum; } static class ReplicaCount { diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java index 719b1d171613..ff29afc60b50 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java +++ b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java @@ -44,6 +44,13 @@ public class CloudDescriptor { volatile Replica.State lastPublished = Replica.State.ACTIVE; public static final String NUM_SHARDS = "numShards"; + + public static final String REPLICA_TYPE = "replicaType"; + + /** + * The type of replica this core hosts + */ + private final Replica.Type replicaType; public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { this.cd = cd; @@ -57,7 +64,7 @@ public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { if (Strings.isNullOrEmpty(nodeName)) this.nodeName = null; this.numShards = PropertiesUtil.toInteger(props.getProperty(CloudDescriptor.NUM_SHARDS), null); - + this.replicaType = Replica.Type.valueOf(props.getProperty(CloudDescriptor.REPLICA_TYPE, Replica.Type.REALTIME.name())); for (String propName : props.stringPropertyNames()) { if (propName.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) { collectionParams.put(propName.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), props.getProperty(propName)); @@ -65,6 +72,10 @@ public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { } } + public boolean requiresTransactionLog() { + return this.replicaType != Replica.Type.PASSIVE; + } + public Replica.State getLastPublished() { return lastPublished; } @@ -155,4 +166,8 @@ public void reload(CloudDescriptor reloadFrom) { collectionParams.put(ent.getKey(), ent.getValue()); } } + + public Replica.Type getReplicaType() { + return replicaType; + } } diff --git a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java index a1bb70e36ab9..41b842057631 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java @@ -60,8 +60,7 @@ import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET; import static org.apache.solr.cloud.OverseerCollectionMessageHandler.NUM_SLICES; import static org.apache.solr.cloud.OverseerCollectionMessageHandler.RANDOM; -import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE; -import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; +import static org.apache.solr.common.cloud.ZkStateReader.*; import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA; import static org.apache.solr.common.params.CommonAdminParams.ASYNC; import static org.apache.solr.common.params.CommonParams.NAME; @@ -96,7 +95,9 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul // look at the replication factor and see if it matches reality // if it does not, find best nodes to create more cores - int repFactor = message.getInt(REPLICATION_FACTOR, 1); + int numRealtimeReplicas = message.getInt(REALTIME_REPLICAS, message.getInt(REPLICATION_FACTOR, 1)); + int numPassiveReplicas = message.getInt(PASSIVE_REPLICAS, 0); + int numAppendReplicas = message.getInt(APPEND_REPLICAS, 0); ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(); final String async = message.getStr(ASYNC); @@ -116,8 +117,8 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul int maxShardsPerNode = message.getInt(MAX_SHARDS_PER_NODE, 1); - if (repFactor <= 0) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, REPLICATION_FACTOR + " must be greater than 0"); + if (numRealtimeReplicas + numAppendReplicas <= 0) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, REALTIME_REPLICAS + " + " + APPEND_REPLICAS + " must be greater than 0"); } if (numSlices <= 0) { @@ -135,11 +136,11 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul positionVsNodes = new HashMap<>(); } else { - if (repFactor > nodeList.size()) { + if (numRealtimeReplicas > nodeList.size()) { log.warn("Specified " - + REPLICATION_FACTOR + + REALTIME_REPLICAS + " of " - + repFactor + + numRealtimeReplicas + " on collection " + collectionName + " is higher than or equal to the number of Solr instances currently live or live and part of your " + CREATE_NODE_SET + "(" @@ -148,19 +149,21 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul } int maxShardsAllowedToCreate = maxShardsPerNode * nodeList.size(); - int requestedShardsToCreate = numSlices * repFactor; + int requestedShardsToCreate = numSlices * (numRealtimeReplicas + numPassiveReplicas + numAppendReplicas); if (maxShardsAllowedToCreate < requestedShardsToCreate) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Cannot create collection " + collectionName + ". Value of " + MAX_SHARDS_PER_NODE + " is " + maxShardsPerNode + ", and the number of nodes currently live or live and part of your "+CREATE_NODE_SET+" is " + nodeList.size() + ". This allows a maximum of " + maxShardsAllowedToCreate + " to be created. Value of " + NUM_SLICES + " is " + numSlices - + " and value of " + REPLICATION_FACTOR + " is " + repFactor + + ", value of " + REALTIME_REPLICAS + " is " + numRealtimeReplicas + + ", value of " + APPEND_REPLICAS + " is " + numAppendReplicas + + " and value of " + PASSIVE_REPLICAS + " is " + numPassiveReplicas + ". This requires " + requestedShardsToCreate + " shards to be created (higher than the allowed number)"); } - positionVsNodes = ocmh.identifyNodes(clusterState, nodeList, message, shardNames, repFactor); + positionVsNodes = ocmh.identifyNodes(clusterState, nodeList, message, shardNames, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas); } ZkStateReader zkStateReader = ocmh.zkStateReader; @@ -200,13 +203,15 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul Map requestMap = new HashMap<>(); - log.debug(formatString("Creating SolrCores for new collection {0}, shardNames {1} , replicationFactor : {2}", - collectionName, shardNames, repFactor)); + log.debug(formatString("Creating SolrCores for new collection {0}, shardNames {1} , realtimeReplicas : {2}, appendReplicas: {3}, passiveReplicas: {4}", + collectionName, shardNames, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas)); Map coresToCreate = new LinkedHashMap<>(); for (Map.Entry e : positionVsNodes.entrySet()) { ReplicaAssigner.Position position = e.getKey(); String nodeName = e.getValue(); - String coreName = collectionName + "_" + position.shard + "_replica" + (position.index + 1); + // TODO: Adding the suffix is great for debugging, but may be an issue if at some point we want to support a way to change replica type +// String coreName = collectionName + "_" + position.shard + "_replica" + position.suffix + (position.index + 1); + String coreName = Assign.buildCoreName(collectionName, position.shard, position.type, position.index + 1); log.debug(formatString("Creating core {0} as part of shard {1} of collection {2} on {3}" , coreName, position.shard, collectionName, nodeName)); @@ -221,7 +226,8 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul ZkStateReader.SHARD_ID_PROP, position.shard, ZkStateReader.CORE_NAME_PROP, coreName, ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(), - ZkStateReader.BASE_URL_PROP, baseUrl); + ZkStateReader.BASE_URL_PROP, baseUrl, + ZkStateReader.REPLICA_TYPE, position.type.name()); Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props)); } @@ -235,6 +241,8 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul params.set(CoreAdminParams.SHARD, position.shard); params.set(ZkStateReader.NUM_SHARDS_PROP, numSlices); params.set(CoreAdminParams.NEW_COLLECTION, "true"); + // This is used to tell the CoreAdminHandler that the new core doesn't need a tlog in case of passive replicas + params.set(CoreAdminParams.REPLICA_TYPE, position.type.name()); if (async != null) { String coreAdminAsyncId = async + Math.abs(System.nanoTime()); diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java index bdbeca9d568c..21549bcea25f 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java +++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.ArrayList; +import java.util.EnumSet; import java.util.List; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; @@ -326,6 +327,8 @@ void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStart) throws Kee return; } + Replica.Type replicaType; + try (SolrCore core = cc.getCore(coreName)) { if (core == null) { @@ -338,6 +341,8 @@ void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStart) throws Kee } } + replicaType = core.getCoreDescriptor().getCloudDescriptor().getReplicaType(); + // should I be leader? if (weAreReplacement && !shouldIBeLeader(leaderProps, core, weAreReplacement)) { rejoinLeaderElection(core); @@ -423,9 +428,7 @@ void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStart) throws Kee try { // we must check LIR before registering as leader checkLIR(coreName, allReplicasInLine); - - boolean onlyLeaderIndexes = zkController.getClusterState().getCollection(collection).getRealtimeReplicas() == 1; - if (onlyLeaderIndexes) { + if (replicaType == Replica.Type.APPEND) { // stop replicate from old leader zkController.stopReplicationFromLeader(coreName); if (weAreReplacement) { @@ -621,7 +624,7 @@ private boolean waitForReplicasToComeUp(int timeoutms) throws InterruptedExcepti } // on startup and after connection timeout, wait for all known shards - if (found >= slices.getReplicasMap().size()) { + if (found >= slices.getReplicas(EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)).size()) { log.info("Enough replicas found to continue."); return true; } else { @@ -629,7 +632,7 @@ private boolean waitForReplicasToComeUp(int timeoutms) throws InterruptedExcepti log.info("Waiting until we see more replicas up for shard {}: total={}" + " found={}" + " timeoutin={}ms", - shardId, slices.getReplicasMap().size(), found, + shardId, slices.getReplicas(EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)).size(), found, TimeUnit.MILLISECONDS.convert(timeoutAt - System.nanoTime(), TimeUnit.NANOSECONDS)); } } diff --git a/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java b/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java index 7b1ad2c2b868..a1a41bb8207b 100644 --- a/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java @@ -51,7 +51,7 @@ import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET; import static org.apache.solr.cloud.OverseerCollectionMessageHandler.NUM_SLICES; import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; +import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE; @@ -208,7 +208,7 @@ private void migrateKey(ClusterState clusterState, DocCollection sourceCollectio Map props = makeMap( Overseer.QUEUE_OPERATION, CREATE.toLower(), NAME, tempSourceCollectionName, - REPLICATION_FACTOR, 1, + REALTIME_REPLICAS, 1, NUM_SLICES, 1, COLL_CONF, configName, CREATE_NODE_SET, sourceLeader.getNodeName()); @@ -224,7 +224,7 @@ private void migrateKey(ClusterState clusterState, DocCollection sourceCollectio Slice tempSourceSlice = clusterState.getCollection(tempSourceCollectionName).getSlices().iterator().next(); Replica tempSourceLeader = zkStateReader.getLeaderRetry(tempSourceCollectionName, tempSourceSlice.getName(), 120000); - String tempCollectionReplica1 = tempSourceCollectionName + "_" + tempSourceSlice.getName() + "_replica1"; + String tempCollectionReplica1 = Assign.buildCoreName(tempSourceCollectionName, tempSourceSlice.getName(), Replica.Type.REALTIME, 1); String coreNodeName = ocmh.waitForCoreNodeName(tempSourceCollectionName, sourceLeader.getNodeName(), tempCollectionReplica1); // wait for the replicas to be seen as active on temp source leader @@ -257,7 +257,7 @@ private void migrateKey(ClusterState clusterState, DocCollection sourceCollectio log.info("Creating a replica of temporary collection: {} on the target leader node: {}", tempSourceCollectionName, targetLeader.getNodeName()); - String tempCollectionReplica2 = tempSourceCollectionName + "_" + tempSourceSlice.getName() + "_replica2"; + String tempCollectionReplica2 = Assign.buildCoreName(tempSourceCollectionName, tempSourceSlice.getName(), Replica.Type.REALTIME, 2); props = new HashMap<>(); props.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower()); props.put(COLLECTION_PROP, tempSourceCollectionName); diff --git a/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java index 545989e22d0f..fed1398e8580 100644 --- a/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java @@ -111,7 +111,7 @@ private void moveReplica(ClusterState clusterState, ZkNodeProps message, NamedLi private void moveHdfsReplica(ClusterState clusterState, NamedList results, String dataDir, String targetNode, String async, DocCollection coll, Replica replica, Slice slice) throws Exception { - String newCoreName = Assign.buildCoreName(coll, slice.getName()); + String newCoreName = Assign.buildCoreName(coll, slice.getName(), replica.getType()); ZkNodeProps removeReplicasProps = new ZkNodeProps( COLLECTION_PROP, coll.getName(), @@ -155,7 +155,7 @@ private void moveHdfsReplica(ClusterState clusterState, NamedList results, Strin private void moveNormalReplica(ClusterState clusterState, NamedList results, String targetNode, String async, DocCollection coll, Replica replica, Slice slice) throws Exception { - String newCoreName = Assign.buildCoreName(coll, slice.getName()); + String newCoreName = Assign.buildCoreName(coll, slice.getName(), replica.getType()); ZkNodeProps addReplicasProps = new ZkNodeProps( COLLECTION_PROP, coll.getName(), SHARD_ID_PROP, slice.getName(), diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java index 2c083051626d..4cae7856e479 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java @@ -16,6 +16,51 @@ */ package org.apache.solr.cloud; +import static org.apache.solr.common.cloud.DocCollection.SNITCH; +import static org.apache.solr.common.cloud.ZkStateReader.BASE_URL_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_VALUE_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICAPROP; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDROLE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.BACKUP; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.BALANCESHARDUNIQUE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATEALIAS; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESNAPSHOT; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEALIAS; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETENODE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICAPROP; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESNAPSHOT; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.MIGRATE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.MIGRATESTATEFORMAT; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOCK_COLL_TASK; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOCK_REPLICA_TASK; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOCK_SHARD_TASK; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOVEREPLICA; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.MODIFYCOLLECTION; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.OVERSEERSTATUS; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.REBALANCELEADERS; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.RELOAD; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.REMOVEROLE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.REPLACENODE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.RESTORE; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.SPLITSHARD; +import static org.apache.solr.common.params.CommonAdminParams.ASYNC; +import static org.apache.solr.common.params.CommonParams.NAME; +import static org.apache.solr.common.util.Utils.makeMap; + import java.io.Closeable; import java.io.IOException; import java.lang.invoke.MethodHandles; @@ -33,7 +78,6 @@ import java.util.concurrent.SynchronousQueue; import java.util.concurrent.TimeUnit; -import com.google.common.collect.ImmutableMap; import org.apache.commons.lang.StringUtils; import org.apache.solr.client.solrj.SolrResponse; import org.apache.solr.client.solrj.SolrServerException; @@ -79,21 +123,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.solr.common.cloud.DocCollection.SNITCH; -import static org.apache.solr.common.cloud.ZkStateReader.BASE_URL_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_VALUE_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; -import static org.apache.solr.common.params.CollectionParams.CollectionAction.*; -import static org.apache.solr.common.params.CommonAdminParams.ASYNC; -import static org.apache.solr.common.params.CommonParams.NAME; -import static org.apache.solr.common.util.Utils.makeMap; +import com.google.common.collect.ImmutableMap; /** * A {@link OverseerMessageHandler} that handles Collections API related @@ -126,12 +156,12 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler static final String SKIP_CREATE_REPLICA_IN_CLUSTER_STATE = "skipCreateReplicaInClusterState"; + //nocommit: review public static final Map COLL_PROPS = Collections.unmodifiableMap(makeMap( ROUTER, DocRouter.DEFAULT_NAME, ZkStateReader.REPLICATION_FACTOR, "1", ZkStateReader.MAX_SHARDS_PER_NODE, "1", ZkStateReader.AUTO_ADD_REPLICAS, "false", - ZkStateReader.REALTIME_REPLICAS, "-1", DocCollection.RULE, null, SNITCH, null)); @@ -700,18 +730,33 @@ Map identifyNodes(ClusterState clusterState, List nodeList, ZkNodeProps message, List shardNames, - int repFactor) throws IOException { + int numRealtimeReplicas, + int numAppendReplicas, + int numPassiveReplicas) throws IOException { List rulesMap = (List) message.get("rule"); if (rulesMap == null) { int i = 0; Map result = new HashMap<>(); for (String aShard : shardNames) { - for (int j = 0; j < repFactor; j++){ - result.put(new Position(aShard, j), nodeList.get(i % nodeList.size())); + for (int j = 0; j < numRealtimeReplicas; j++){ + result.put(new Position(aShard, j, Replica.Type.REALTIME), nodeList.get(i % nodeList.size())); + i++; + } + for (int j = 0; j < numAppendReplicas; j++){ + result.put(new Position(aShard, j, Replica.Type.APPEND), nodeList.get(i % nodeList.size())); + i++; + } + for (int j = 0; j < numPassiveReplicas; j++){ + result.put(new Position(aShard, j, Replica.Type.PASSIVE), nodeList.get(i % nodeList.size())); i++; } } return result; + } else { + if (numAppendReplicas + numPassiveReplicas != 0) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + Replica.Type.APPEND + " or " + Replica.Type.PASSIVE + " replica types not supported with placement rules"); + } } List rules = new ArrayList<>(); @@ -719,7 +764,7 @@ Map identifyNodes(ClusterState clusterState, Map sharVsReplicaCount = new HashMap<>(); - for (String shard : shardNames) sharVsReplicaCount.put(shard, repFactor); + for (String shard : shardNames) sharVsReplicaCount.put(shard, numRealtimeReplicas); ReplicaAssigner replicaAssigner = new ReplicaAssigner(rules, sharVsReplicaCount, (List) message.get(SNITCH), @@ -750,6 +795,8 @@ Map waitToSeeReplicasInState(String collectionName, Collection< if (result.size() == coreNames.size()) { return result; + } else { + log.debug("Expecting {} cores but found {}", coreNames.size(), result.size()); } if (timeout.hasTimedOut()) { throw new SolrException(ErrorCode.SERVER_ERROR, "Timed out waiting to see all replicas: " + coreNames + " in cluster state."); diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index 496d0826d705..5a952ea0b9d7 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -74,7 +74,7 @@ * between versions in terms of API or back compat behaviour. * @lucene.experimental */ -public class RecoveryStrategy extends Thread implements Closeable { +public class RecoveryStrategy implements Runnable, Closeable { public static class Builder implements NamedListInitializedPlugin { private NamedList args; @@ -118,19 +118,17 @@ public static interface RecoveryListener { private boolean recoveringAfterStartup; private CoreContainer cc; private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest; - private boolean onlyLeaderIndexes; + private final Replica.Type replicaType; protected RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) { this.cc = cc; this.coreName = cd.getName(); this.recoveryListener = recoveryListener; - setName("RecoveryThread-"+this.coreName); zkController = cc.getZkController(); zkStateReader = zkController.getZkStateReader(); baseUrl = zkController.getBaseUrl(); coreZkNodeName = cd.getCloudDescriptor().getCoreNodeName(); - String collection = cd.getCloudDescriptor().getCollectionName(); - onlyLeaderIndexes = zkStateReader.getClusterState().getCollection(collection).getRealtimeReplicas() == 1; + replicaType = cd.getCloudDescriptor().getReplicaType(); } final public int getWaitForUpdatesWithStaleStatePauseMilliSeconds() { @@ -263,7 +261,8 @@ final private void commitOnLeader(String leaderUrl) throws SolrServerException, UpdateRequest ureq = new UpdateRequest(); ureq.setParams(new ModifiableSolrParams()); ureq.getParams().set(DistributedUpdateProcessor.COMMIT_END_POINT, true); - ureq.getParams().set(UpdateParams.OPEN_SEARCHER, onlyLeaderIndexes); +// ureq.getParams().set(UpdateParams.OPEN_SEARCHER, onlyLeaderIndexes);// nocommit: Why do we need to open searcher if "onlyLeaderIndexes"? + ureq.getParams().set(UpdateParams.OPEN_SEARCHER, true); ureq.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, true).process( client); } @@ -297,9 +296,162 @@ final public void run() { MDCLoggingContext.clear(); } } + + final public void doRecovery(SolrCore core) throws KeeperException, InterruptedException { + if (core.getCoreDescriptor().getCloudDescriptor().requiresTransactionLog()) { + doSyncOrReplicateRecovery(core); + } else { + doReplicateOnlyRecovery(core); + } + } + + final private void doReplicateOnlyRecovery(SolrCore core) throws InterruptedException { + boolean successfulRecovery = false; + +// if (core.getUpdateHandler().getUpdateLog() != null) { +// SolrException.log(LOG, "'replicate-only' recovery strategy should only be used if no update logs are present, but this core has one: " +// + core.getUpdateHandler().getUpdateLog()); +// return; +// } + while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though + try { + CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor(); + ZkNodeProps leaderprops = zkStateReader.getLeaderRetry( + cloudDesc.getCollectionName(), cloudDesc.getShardId()); + final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP); + final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP); + + String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName); + + String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName); + + boolean isLeader = leaderUrl.equals(ourUrl); //TODO: We can probably delete most of this code if we say this strategy can only be used for passive replicas + if (isLeader && !cloudDesc.isLeader()) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader."); + } + if (cloudDesc.isLeader()) { + // we are now the leader - no one else must have been suitable + LOG.warn("We have not yet recovered - but we are now the leader!"); + LOG.info("Finished recovery process."); + zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE); + return; + } + + + LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl, + ourUrl); + zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING); + + if (isClosed()) { + LOG.info("Recovery for core {} has been closed", core.getName()); + break; + } + + if (isClosed()) { + LOG.info("Recovery for core {} has been closed", core.getName()); + break; + } + LOG.info("Starting Replication Recovery."); + + try { + LOG.info("Stopping background replicate from leader process"); + zkController.stopReplicationFromLeader(coreName); + replicate(zkController.getNodeName(), core, leaderprops); + + if (isClosed()) { + LOG.info("Recovery for core {} has been closed", core.getName()); + break; + } + + LOG.info("Replication Recovery was successful."); + successfulRecovery = true; + } catch (Exception e) { + SolrException.log(LOG, "Error while trying to recover", e); + } + + } catch (Exception e) { + SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e); + } finally { + if (successfulRecovery) { + LOG.info("Restaring background replicate from leader process"); + zkController.startReplicationFromLeader(coreName, false); + LOG.info("Registering as Active after recovery."); + try { + zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE); + } catch (Exception e) { + LOG.error("Could not publish as ACTIVE after succesful recovery", e); + successfulRecovery = false; + } + + if (successfulRecovery) { + close = true; + recoveryListener.recovered(); + } + } + } + + if (!successfulRecovery) { + // lets pause for a moment and we need to try again... + // TODO: we don't want to retry for some problems? + // Or do a fall off retry... + try { + + if (isClosed()) { + LOG.info("Recovery for core {} has been closed", core.getName()); + break; + } + + LOG.error("Recovery failed - trying again... (" + retries + ")"); + + retries++; + if (retries >= maxRetries) { + SolrException.log(LOG, "Recovery failed - max retries exceeded (" + retries + ")."); + try { + recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor()); + } catch (Exception e) { + SolrException.log(LOG, "Could not publish that recovery failed", e); + } + break; + } + } catch (Exception e) { + SolrException.log(LOG, "An error has occurred during recovery", e); + } + + try { + // Wait an exponential interval between retries, start at 5 seconds and work up to a minute. + // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result + // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in + // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m). + double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12; + LOG.info("Wait [{}] seconds before trying to recover again (attempt={})", loopCount, retries); + for (int i = 0; i < loopCount; i++) { + if (isClosed()) { + LOG.info("Recovery for core {} has been closed", core.getName()); + break; // check if someone closed us + } + Thread.sleep(startingRecoveryDelayMilliSeconds); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + LOG.warn("Recovery was interrupted.", e); + close = true; + } + } + + } + + // if replay was skipped (possibly to due pulling a full index from the leader), + // then we still need to update version bucket seeds after recovery + if (successfulRecovery) { + LOG.info("Updating version bucket highest from index after successful recovery."); + core.seedVersionBuckets(); + } + + LOG.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery)); +} // TODO: perhaps make this grab a new core each time through the loop to handle core reloads? - final public void doRecovery(SolrCore core) throws KeeperException, InterruptedException { + final public void doSyncOrReplicateRecovery(SolrCore core) throws KeeperException, InterruptedException { boolean replayed = false; boolean successfulRecovery = false; @@ -311,9 +463,9 @@ final public void doRecovery(SolrCore core) throws KeeperException, InterruptedE core.getCoreDescriptor()); return; } - - // we temporary ignore peersync for realtimeReplicas mode - boolean firstTime = !onlyLeaderIndexes; + + // we temporary ignore peersync for Append replicas + boolean firstTime = replicaType != Replica.Type.APPEND; List recentVersions; try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) { @@ -365,12 +517,12 @@ final public void doRecovery(SolrCore core) throws KeeperException, InterruptedE } } - if (onlyLeaderIndexes) { + if (replicaType == Replica.Type.APPEND) { zkController.stopReplicationFromLeader(coreName); } Future replayFuture = null; - while (!successfulRecovery && !isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though + while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though try { CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor(); ZkNodeProps leaderprops = zkStateReader.getLeaderRetry( @@ -522,8 +674,8 @@ final public void doRecovery(SolrCore core) throws KeeperException, InterruptedE if (successfulRecovery) { LOG.info("Registering as Active after recovery."); try { - if (onlyLeaderIndexes) { - zkController.startReplicationFromLeader(coreName); + if (replicaType == Replica.Type.APPEND) { + zkController.startReplicationFromLeader(coreName, true); } zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE); } catch (Exception e) { @@ -605,7 +757,7 @@ final private Future replay(SolrCore core) if (testing_beforeReplayBufferingUpdates != null) { testing_beforeReplayBufferingUpdates.run(); } - if (onlyLeaderIndexes) { + if (replicaType == Replica.Type.APPEND) { // roll over all updates during buffering to new tlog, make RTG available SolrQueryRequest req = new LocalSolrQueryRequest(core, new ModifiableSolrParams()); diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index 817b371f7362..b0bca44a7ddf 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -49,7 +49,12 @@ public ReplicateFromLeader(CoreContainer cc, String coreName) { this.coreName = coreName; } - public void startReplication() throws InterruptedException { + /** + * Start a replication handler thread that will periodically pull indices from the shard leader + * @param switchTransactionLog if true, ReplicationHandler will rotate the transaction log once + * the replication is done + */ + public void startReplication(boolean switchTransactionLog) throws InterruptedException { try (SolrCore core = cc.getCore(coreName)) { if (core == null) { if (cc.isShutDown()) { @@ -78,20 +83,22 @@ public void startReplication() throws InterruptedException { } replicationProcess = new ReplicationHandler(); - replicationProcess.setPollListener((solrCore, pollSuccess) -> { - if (pollSuccess) { - String commitVersion = getCommitVersion(core); - if (commitVersion == null) return; - if (Long.parseLong(commitVersion) == lastVersion) return; - UpdateLog updateLog = solrCore.getUpdateHandler().getUpdateLog(); - SolrQueryRequest req = new LocalSolrQueryRequest(core, - new ModifiableSolrParams()); - CommitUpdateCommand cuc = new CommitUpdateCommand(req, false); - cuc.setVersion(Long.parseLong(commitVersion)); - updateLog.copyOverOldUpdates(cuc); - lastVersion = Long.parseLong(commitVersion); - } - }); + if (switchTransactionLog) { + replicationProcess.setPollListener((solrCore, pollSuccess) -> { + if (pollSuccess) { + String commitVersion = getCommitVersion(core); + if (commitVersion == null) return; + if (Long.parseLong(commitVersion) == lastVersion) return; + UpdateLog updateLog = solrCore.getUpdateHandler().getUpdateLog(); + SolrQueryRequest req = new LocalSolrQueryRequest(core, + new ModifiableSolrParams()); + CommitUpdateCommand cuc = new CommitUpdateCommand(req, false); + cuc.setVersion(Long.parseLong(commitVersion)); + updateLog.copyOverOldUpdates(cuc); + lastVersion = Long.parseLong(commitVersion); + } + }); + } replicationProcess.init(replicationConfig); replicationProcess.inform(core); } diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java index 5a099e1e6af7..837af7985f09 100644 --- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java @@ -385,7 +385,7 @@ public boolean split(ClusterState clusterState, ZkNodeProps message, NamedList r Map nodeMap = ocmh.identifyNodes(clusterState, new ArrayList<>(clusterState.getLiveNodes()), new ZkNodeProps(collection.getProperties()), - subSlices, repFactor - 1); + subSlices, repFactor - 1, 0, 0); List> replicas = new ArrayList<>((repFactor - 1) * 2); diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index b337bd022cae..55dd9a0ad81e 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -62,6 +62,7 @@ import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.OnReconnect; import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Replica.Type; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkACLProvider; @@ -883,12 +884,17 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov try { // If we're a preferred leader, insert ourselves at the head of the queue boolean joinAtHead = false; - Replica replica = zkStateReader.getClusterState().getReplica(desc.getCloudDescriptor().getCollectionName(), - coreZkNodeName); + Replica replica = zkStateReader.getClusterState().getReplica(collection, coreZkNodeName); if (replica != null) { joinAtHead = replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false); } - joinElection(desc, afterExpiration, joinAtHead); + //TODO WHy would replica be null? + if (replica == null || replica.getType() != Type.PASSIVE) { + joinElection(desc, afterExpiration, joinAtHead); + } else if (replica.getType() == Type.PASSIVE) { + log.debug("Replica {} skipping election because replica is passive", coreZkNodeName); + startReplicationFromLeader(coreName, false); + } } catch (InterruptedException e) { // Restore the interrupted status Thread.currentThread().interrupt(); @@ -905,6 +911,8 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName); log.debug("We are " + ourUrl + " and leader is " + leaderUrl); boolean isLeader = leaderUrl.equals(ourUrl); + Replica.Type replicaType = zkStateReader.getClusterState().getCollection(collection).getReplica(coreZkNodeName).getType(); + assert !(isLeader && replicaType == Type.PASSIVE): "Passive replica became leader!"; try (SolrCore core = cc.getCore(desc.getName())) { @@ -915,8 +923,7 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov // leader election perhaps? UpdateLog ulog = core.getUpdateHandler().getUpdateLog(); - boolean onlyLeaderIndexes = zkStateReader.getClusterState().getCollection(collection).getRealtimeReplicas() == 1; - boolean isReplicaInOnlyLeaderIndexes = onlyLeaderIndexes && !isLeader; + boolean isReplicaInOnlyLeaderIndexes = replicaType == Replica.Type.APPEND && !isLeader; if (isReplicaInOnlyLeaderIndexes) { String commitVersion = ReplicateFromLeader.getCommitVersion(core); if (commitVersion != null) { @@ -944,11 +951,12 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov = checkRecovery(recoverReloadedCores, isLeader, skipRecovery, collection, coreZkNodeName, core, cc, afterExpiration); if (!didRecovery) { if (isReplicaInOnlyLeaderIndexes) { - startReplicationFromLeader(coreName); + startReplicationFromLeader(coreName, true); } publish(desc, Replica.State.ACTIVE); } + core.getCoreDescriptor().getCloudDescriptor().setHasRegistered(true); } @@ -960,14 +968,18 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov } } - public void startReplicationFromLeader(String coreName) throws InterruptedException { + public void startReplicationFromLeader(String coreName, boolean switchTransactionLog) throws InterruptedException { + log.info(coreName + " starting replication from leader"); ReplicateFromLeader replicateFromLeader = new ReplicateFromLeader(cc, coreName); if (replicateFromLeaders.putIfAbsent(coreName, replicateFromLeader) == null) { - replicateFromLeader.startReplication(); + replicateFromLeader.startReplication(switchTransactionLog); + } else { + log.warn("A replicate from leader instance already exists for core {}", coreName); } } public void stopReplicationFromLeader(String coreName) { + log.info(coreName + " stopping replication from leader"); ReplicateFromLeader replicateFromLeader = replicateFromLeaders.remove(coreName); if (replicateFromLeader != null) { replicateFromLeader.stopReplication(); @@ -1191,6 +1203,7 @@ public void publish(final CoreDescriptor cd, final Replica.State state, boolean if (state != Replica.State.DOWN) { final Replica.State lirState = getLeaderInitiatedRecoveryState(collection, shardId, coreNodeName); if (lirState != null) { + assert cd.getCloudDescriptor().getReplicaType() != Replica.Type.PASSIVE; if (state == Replica.State.ACTIVE) { // trying to become active, so leader-initiated state must be recovering if (lirState == Replica.State.RECOVERING) { @@ -1272,12 +1285,16 @@ public void unregister(String coreName, CoreDescriptor cd) throws InterruptedExc assert false : "No collection was specified [" + collection + "]"; return; } + Replica replica = zkStateReader.getClusterState().getReplica(collection, coreNodeName); + + if (replica == null || replica.getType() != Type.PASSIVE) { + ElectionContext context = electionContexts.remove(new ContextKey(collection, coreNodeName)); - ElectionContext context = electionContexts.remove(new ContextKey(collection, coreNodeName)); - - if (context != null) { - context.cancelElection(); + if (context != null) { + context.cancelElection(); + } } +// //TODO: Do we need to stop replication for type==append? CloudDescriptor cloudDescriptor = cd.getCloudDescriptor(); zkStateReader.unregisterCore(cloudDescriptor.getCollectionName()); @@ -1360,6 +1377,7 @@ private void waitForShardId(CoreDescriptor cd) { final String shardId = zkStateReader.getClusterState().getShardId(cd.getCollectionName(), getNodeName(), cd.getName()); if (shardId != null) { cd.getCloudDescriptor().setShardId(shardId); + log.debug("Shard ID is {} for core {} ", shardId, cd.getName()); return; } try { @@ -2407,11 +2425,9 @@ public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) { for (Slice slice : slices) { Collection replicas = slice.getReplicas(); - for (Replica replica : replicas) { - if (replica.getName().equals( - dcore.getCloudDescriptor().getCoreNodeName())) { - return true; - } + Replica r = slice.getReplica(dcore.getCloudDescriptor().getCoreNodeName()); + if (r != null) { + return true; } } } diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java index f03eeeb4e3a5..c467405c9e59 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java @@ -271,11 +271,12 @@ private ZkWriteCommand updateState(final ClusterState prevState, ZkNodeProps mes replicaProps.putAll(message.getProperties()); if (slice != null) { - Replica oldReplica = slice.getReplicasMap().get(coreNodeName); + Replica oldReplica = slice.getReplica(coreNodeName); if (oldReplica != null) { if (oldReplica.containsKey(ZkStateReader.LEADER_PROP)) { replicaProps.put(ZkStateReader.LEADER_PROP, oldReplica.get(ZkStateReader.LEADER_PROP)); } + replicaProps.put(ZkStateReader.REPLICA_TYPE, oldReplica.getType().toString()); // Move custom props over. for (Map.Entry ent : oldReplica.getProperties().entrySet()) { if (ent.getKey().startsWith(COLL_PROP_PREFIX)) { @@ -311,6 +312,8 @@ private ZkWriteCommand updateState(final ClusterState prevState, ZkNodeProps mes Replica replica = new Replica(coreNodeName, replicaProps); + + log.debug("Will update state for replica: " + replica); Map sliceProps = null; Map replicas; @@ -328,11 +331,11 @@ private ZkWriteCommand updateState(final ClusterState prevState, ZkNodeProps mes sliceProps.put(ZkStateReader.STATE_PROP, shardState); sliceProps.put(Slice.PARENT, shardParent); } - replicas.put(replica.getName(), replica); slice = new Slice(sliceName, replicas, sliceProps); DocCollection newCollection = CollectionMutator.updateSlice(collectionName, collection, slice); + log.debug("Collection is now: " + newCollection); return new ZkWriteCommand(collectionName, newCollection); } diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java index ec2ce2e6b4db..b1e969197a5d 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java @@ -16,14 +16,16 @@ */ package org.apache.solr.cloud.overseer; -import java.lang.invoke.MethodHandles; +import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_PROP_PREFIX; +import static org.apache.solr.cloud.overseer.CollectionMutator.checkCollectionKeyExistence; +import static org.apache.solr.common.util.Utils.makeMap; +import java.lang.invoke.MethodHandles; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; -import com.google.common.collect.ImmutableSet; import org.apache.solr.cloud.Assign; import org.apache.solr.cloud.Overseer; import org.apache.solr.common.cloud.ClusterState; @@ -37,9 +39,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_PROP_PREFIX; -import static org.apache.solr.cloud.overseer.CollectionMutator.checkCollectionKeyExistence; -import static org.apache.solr.common.util.Utils.makeMap; +import com.google.common.collect.ImmutableSet; public class SliceMutator { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -66,14 +66,16 @@ public ZkWriteCommand addReplica(ClusterState clusterState, ZkNodeProps message) log.error("Invalid Collection/Slice {}/{} ", coll, slice); return ZkStateWriter.NO_OP; } - String coreNodeName = Assign.assignNode(collection); +// Replica replica = new Replica(coreNodeName, + // coreNodeName overlaps? Replica replica = new Replica(coreNodeName, makeMap( ZkStateReader.CORE_NAME_PROP, message.getStr(ZkStateReader.CORE_NAME_PROP), ZkStateReader.BASE_URL_PROP, message.getStr(ZkStateReader.BASE_URL_PROP), ZkStateReader.STATE_PROP, message.getStr(ZkStateReader.STATE_PROP), - ZkStateReader.NODE_NAME_PROP, message.getStr(ZkStateReader.NODE_NAME_PROP))); + ZkStateReader.NODE_NAME_PROP, message.getStr(ZkStateReader.NODE_NAME_PROP), + ZkStateReader.REPLICA_TYPE, message.get(ZkStateReader.REPLICA_TYPE))); return new ZkWriteCommand(coll, updateReplica(collection, sl, replica.getName(), replica)); } @@ -248,13 +250,15 @@ public ZkWriteCommand removeRoutingRule(final ClusterState clusterState, ZkNodeP } public static DocCollection updateReplica(DocCollection collection, final Slice slice, String coreNodeName, final Replica replica) { - Map copy = slice.getReplicasCopy(); + Map replicasCopy = slice.getReplicasCopy(); if (replica == null) { - copy.remove(coreNodeName); + replicasCopy.remove(coreNodeName); } else { - copy.put(replica.getName(), replica); + replicasCopy.put(replica.getName(), replica); } - Slice newSlice = new Slice(slice.getName(), copy, slice.getProperties()); + Slice newSlice = new Slice(slice.getName(), replicasCopy, slice.getProperties()); + log.info("Old Slice: " + slice); + log.info("New Slice: " + newSlice); return CollectionMutator.updateSlice(collection.getName(), collection, newSlice); } } diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java index 23fb56cb1876..a906b86d8465 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java @@ -268,6 +268,7 @@ public ClusterState writePendingUpdates() throws IllegalStateException, KeeperEx } } + log.debug("New Cluster State is: " + clusterState); return clusterState; } diff --git a/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java b/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java index 3eab8b49fc76..ebe29f1128f1 100644 --- a/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java +++ b/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java @@ -62,10 +62,12 @@ public class ReplicaAssigner { public static class Position implements Comparable { public final String shard; public final int index; + public final Replica.Type type; - public Position(String shard, int replicaIdx) { + public Position(String shard, int replicaIdx, Replica.Type type) { this.shard = shard; this.index = replicaIdx; + this.type = type; } @Override @@ -188,7 +190,7 @@ private Map tryAllPermutations(List shardNames, List positions = new ArrayList<>(); for (int pos : p) { for (int j = 0; j < shardVsReplicaCount.get(shardNames.get(pos)); j++) { - positions.add(new Position(shardNames.get(pos), j)); + positions.add(new Position(shardNames.get(pos), j, Replica.Type.REALTIME)); } } Collections.sort(positions); diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index 28c1eafee3f0..a4f15f7f3f83 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -16,6 +16,17 @@ */ package org.apache.solr.core; +import static java.util.Objects.requireNonNull; +import static org.apache.solr.common.params.CommonParams.AUTHC_PATH; +import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH; +import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH; +import static org.apache.solr.common.params.CommonParams.METRICS_PATH; +import static org.apache.solr.common.params.CommonParams.ZK_PATH; +import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP; + import java.io.IOException; import java.lang.invoke.MethodHandles; import java.nio.file.Path; @@ -56,6 +67,7 @@ import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Replica.State; +import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.util.ExecutorUtil; import org.apache.solr.common.util.IOUtils; import org.apache.solr.common.util.Utils; @@ -95,17 +107,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static java.util.Objects.requireNonNull; -import static org.apache.solr.common.params.CommonParams.AUTHC_PATH; -import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH; -import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH; -import static org.apache.solr.common.params.CommonParams.METRICS_PATH; -import static org.apache.solr.common.params.CommonParams.ZK_PATH; import static org.apache.solr.core.CorePropertiesLocator.PROPERTIES_FILENAME; -import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP; /** * @@ -1184,10 +1186,15 @@ public void reload(String name) { SolrCore newCore = core.reload(coreConfig); registerCore(cd, newCore, false, false); if (getZkController() != null) { - boolean onlyLeaderIndexes = getZkController().getClusterState().getCollection(cd.getCollectionName()).getRealtimeReplicas() == 1; - if (onlyLeaderIndexes && !cd.getCloudDescriptor().isLeader()) { + DocCollection docCollection = getZkController().getClusterState().getCollection(cd.getCollectionName()); + Replica replica = docCollection.getReplica(cd.getCloudDescriptor().getCoreNodeName()); + assert replica != null; + if (replica.getType() == Replica.Type.APPEND) { //TODO: needed here? getZkController().stopReplicationFromLeader(core.getName()); - getZkController().startReplicationFromLeader(newCore.getName()); + if (!cd.getCloudDescriptor().isLeader()) { + getZkController().startReplicationFromLeader(newCore.getName(), true); + } + } } } catch (SolrCoreState.CoreIsClosedException e) { @@ -1273,6 +1280,10 @@ public void unload(String name, boolean deleteIndexDir, boolean deleteDataDir, b if (zkSys.getZkController() != null) { // cancel recovery in cloud mode core.getSolrCoreState().cancelRecovery(); + if (core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.PASSIVE) { // TODO: Also for Replica.Type.ACTIVE? + // Stop replication if this is part of a passive replica before closing the code + zkSys.getZkController().stopReplicationFromLeader(name); + } } core.unloadOnClose(cd, deleteIndexDir, deleteDataDir, deleteInstanceDir); diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java index 96e505a86d97..84b36b5bb0d6 100644 --- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java +++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java @@ -352,17 +352,28 @@ IndexFetchResult fetchLatestIndex(boolean forceReplication, boolean forceCoreRel // when we are a bit more confident we may want to try a partial replication // if the error is connection related or something, but we have to be careful forceReplication = true; + LOG.info("Last replication failed, so I'll force replication"); } try { if (fetchFromLeader) { + assert !solrCore.isClosed(): "Replication should be stopped before closing the core"; Replica replica = getLeaderReplica(); CloudDescriptor cd = solrCore.getCoreDescriptor().getCloudDescriptor(); if (cd.getCoreNodeName().equals(replica.getName())) { return IndexFetchResult.EXPECTING_NON_LEADER; } - masterUrl = replica.getCoreUrl(); - LOG.info("Updated masterUrl to " + masterUrl); + if (replica.getState() != Replica.State.ACTIVE) { + LOG.info("Replica {} is leader but it's state is {}, skipping replication", replica.getName(), replica.getState()); + return IndexFetchResult.EXPECTING_NON_LEADER;//nocommit: not the correct error + } + if (!replica.getCoreUrl().equals(masterUrl)) { + masterUrl = replica.getCoreUrl(); + LOG.info("Updated masterUrl to {}", masterUrl); + // TODO: Do we need to set forceReplication = true? + } else { + LOG.debug("masterUrl didn't change"); + } } //get the current 'replicateable' index version in the master NamedList response; @@ -410,6 +421,7 @@ IndexFetchResult fetchLatestIndex(boolean forceReplication, boolean forceCoreRel if (forceReplication && commit.getGeneration() != 0) { // since we won't get the files for an empty index, // we just clear ours and commit + LOG.info("New index in Master. Deleting mine..."); RefCounted iw = solrCore.getUpdateHandler().getSolrCoreState().getIndexWriter(solrCore); try { iw.get().deleteAll(); @@ -422,6 +434,7 @@ IndexFetchResult fetchLatestIndex(boolean forceReplication, boolean forceCoreRel //there is nothing to be replicated successfulInstall = true; + LOG.debug("Nothing to replicate, master's version is 0"); return IndexFetchResult.MASTER_VERSION_ZERO; } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index d5c49274d747..e657b62b9e3f 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -113,15 +113,7 @@ import static org.apache.solr.common.cloud.DocCollection.RULE; import static org.apache.solr.common.cloud.DocCollection.SNITCH; import static org.apache.solr.common.cloud.DocCollection.STATE_FORMAT; -import static org.apache.solr.common.cloud.ZkStateReader.AUTO_ADD_REPLICAS; -import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE; -import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; -import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_VALUE_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; -import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.*; import static org.apache.solr.common.params.CollectionAdminParams.COUNT_PROP; import static org.apache.solr.common.params.CollectionParams.CollectionAction.*; import static org.apache.solr.common.params.CommonAdminParams.ASYNC; @@ -407,6 +399,8 @@ enum CollectionOperation implements CollectionOp { AUTO_ADD_REPLICAS, RULE, SNITCH, + PASSIVE_REPLICAS, + APPEND_REPLICAS, REALTIME_REPLICAS); if (props.get(STATE_FORMAT) == null) { @@ -634,7 +628,8 @@ public Map execute(SolrQueryRequest req, SolrQueryResponse rsp, _ROUTE_, CoreAdminParams.NAME, INSTANCE_DIR, - DATA_DIR); + DATA_DIR, + REPLICA_TYPE); return copyPropertiesWithPrefix(req.getParams(), props, COLL_PROP_PREFIX); }), OVERSEERSTATUS_OP(OVERSEERSTATUS, (req, rsp, h) -> (Map) new LinkedHashMap<>()), diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 67463327e2f8..31ad4f78a3dd 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -171,7 +171,9 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw final CallInfo callInfo = new CallInfo(this, req, rsp, op); if (taskId == null) { + log.info("Starting Operation: " + req); callInfo.call(); + log.info("Done with Operation: " + req); } else { try { MDC.put("CoreAdminHandler.asyncId", taskId); @@ -227,6 +229,7 @@ protected void handleCustomAction(SolrQueryRequest req, SolrQueryResponse rsp) { .put(CoreAdminParams.ROLES, CoreDescriptor.CORE_ROLES) .put(CoreAdminParams.CORE_NODE_NAME, CoreDescriptor.CORE_NODE_NAME) .put(ZkStateReader.NUM_SHARDS_PROP, CloudDescriptor.NUM_SHARDS) + .put(CoreAdminParams.REPLICA_TYPE, CloudDescriptor.REPLICA_TYPE) .build(); protected static Map buildCoreParams(SolrParams params) { diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java index 0c2c9039ddec..8e5408b7a963 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java @@ -18,6 +18,7 @@ package org.apache.solr.handler.admin; import java.lang.invoke.MethodHandles; +import java.util.Objects; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.solr.cloud.CloudDescriptor; @@ -175,7 +176,7 @@ public void execute(CallInfo it) throws Exception { "I was asked to wait on state " + waitForState + " for " + shardId + " in " + collection + " on " + nodeName + " but I still do not see the requested state. I see state: " - + state.toString() + " live:" + live + " leader from ZK: " + leaderInfo + + Objects.toString(state) + " live:" + live + " leader from ZK: " + leaderInfo ); } diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java index c0ceddb8d5f5..c4ee76b0476e 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java @@ -106,6 +106,11 @@ public void process(ResponseBuilder rb) throws IOException SolrQueryResponse rsp = rb.rsp; SolrParams params = req.getParams(); + if (req.getCore().getCoreDescriptor().getCloudDescriptor() != null + && !req.getCore().getCoreDescriptor().getCloudDescriptor().requiresTransactionLog()) { + return; + } + if (!params.getBool(COMPONENT_NAME, true)) { return; } diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java index bc2afa879c34..f19b2dfb0e65 100644 --- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java +++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java @@ -16,6 +16,7 @@ */ package org.apache.solr.update; +import java.io.Closeable; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.concurrent.ExecutionException; @@ -32,8 +33,8 @@ import org.apache.lucene.search.Sort; import org.apache.solr.cloud.ActionThrottle; import org.apache.solr.cloud.RecoveryStrategy; -import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.core.CoreContainer; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.core.DirectoryFactory; @@ -65,7 +66,8 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover private DirectoryFactory directoryFactory; private final RecoveryStrategy.Builder recoveryStrategyBuilder; - private volatile RecoveryStrategy recoveryStrat; + private volatile RecoveryStrategy recoveryStrat; //nocommit: Make interface +// private volatile Thread recoveryStrat; private volatile boolean lastReplicationSuccess = true; @@ -365,9 +367,11 @@ public void run() { public void cancelRecovery() { if (recoveryStrat != null) { try { - recoveryStrat.close(); + ((Closeable)recoveryStrat).close(); } catch (NullPointerException e) { // okay + } catch (IOException e) { + // okay } } } diff --git a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java index dd179f22ade5..ec704513f93e 100644 --- a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java +++ b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java @@ -45,7 +45,7 @@ import org.apache.lucene.util.BytesRefHash; import org.apache.solr.cloud.ZkController; import org.apache.solr.common.SolrException; -import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.core.SolrConfig.UpdateHandlerInfo; import org.apache.solr.core.SolrCore; @@ -122,12 +122,9 @@ public DirectUpdateHandler2(SolrCore core) { indexWriterCloseWaitsForMerges = updateHandlerInfo.indexWriterCloseWaitsForMerges; ZkController zkController = core.getCoreContainer().getZkController(); - if (zkController != null) { - DocCollection dc = zkController.getClusterState().getCollection(core.getCoreDescriptor().getCollectionName()); - if (dc.getRealtimeReplicas() == 1) { - commitWithinSoftCommit = false; - commitTracker.setOpenSearcher(true); - } + if (zkController != null && core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.APPEND) { + commitWithinSoftCommit = false; + commitTracker.setOpenSearcher(true); } } @@ -248,7 +245,7 @@ private int addDoc0(AddUpdateCommand cmd) throws IOException { cmd.overwrite = false; } try { - if ( (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) != 0) { + if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) != 0) { if (ulog != null) ulog.add(cmd); return 1; } @@ -424,7 +421,7 @@ public void delete(DeleteUpdateCommand cmd) throws IOException { deleteByIdCommands.increment(); deleteByIdCommandsCumulative.mark(); - if ( (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) != 0 ) { + if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) != 0 ) { if (ulog != null) ulog.delete(cmd); return; } @@ -488,7 +485,7 @@ public void deleteByQuery(DeleteUpdateCommand cmd) throws IOException { deleteByQueryCommandsCumulative.mark(); boolean madeIt=false; try { - if ( (cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) != 0) { + if ((cmd.getFlags() & UpdateCommand.IGNORE_INDEXWRITER) != 0) { if (ulog != null) ulog.deleteByQuery(cmd); madeIt = true; return; @@ -547,7 +544,6 @@ public void deleteByQuery(DeleteUpdateCommand cmd) throws IOException { } } - @Override public int mergeIndexes(MergeIndexesCommand cmd) throws IOException { mergeIndexesCommands.mark(); @@ -920,7 +916,7 @@ public void split(SplitIndexCommand cmd) throws IOException { * Calls either {@link IndexWriter#updateDocValues} or {@link IndexWriter#updateDocument} as * needed based on {@link AddUpdateCommand#isInPlaceUpdate}. *

- * If the this is an UPDATE_INPLACE cmd, then all fields inclued in + * If the this is an UPDATE_INPLACE cmd, then all fields included in * {@link AddUpdateCommand#getLuceneDocument} must either be the uniqueKey field, or be DocValue * only fields. *

diff --git a/solr/core/src/java/org/apache/solr/update/UpdateCommand.java b/solr/core/src/java/org/apache/solr/update/UpdateCommand.java index b124271d977d..6c0fc50217bc 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateCommand.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateCommand.java @@ -34,6 +34,7 @@ public abstract class UpdateCommand implements Cloneable { public static int PEER_SYNC = 0x00000004; // update command is a missing update being provided by a peer. public static int IGNORE_AUTOCOMMIT = 0x00000008; // this update should not count toward triggering of autocommits. public static int CLEAR_CACHES = 0x00000010; // clear caches associated with the update log. used when applying reordered DBQ updates when doing an add. + // TODO: rename to something like "APPEND_REPLICAS_IGNORE_IW", or maybe just "FROM_LEADER"? public static int IGNORE_INDEXWRITER = 0x00000020; public UpdateCommand(SolrQueryRequest req) { diff --git a/solr/core/src/java/org/apache/solr/update/UpdateHandler.java b/solr/core/src/java/org/apache/solr/update/UpdateHandler.java index 49d2664c6494..5c6e33b82dce 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateHandler.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateHandler.java @@ -115,7 +115,7 @@ protected void callPostOptimizeCallbacks() { public UpdateHandler(SolrCore core) { this(core, null); } - + public UpdateHandler(SolrCore core, UpdateLog updateLog) { this.core=core; idField = core.getLatestSchema().getUniqueKeyField(); @@ -124,7 +124,9 @@ public UpdateHandler(SolrCore core, UpdateLog updateLog) { PluginInfo ulogPluginInfo = core.getSolrConfig().getPluginInfo(UpdateLog.class.getName()); - if (updateLog == null && ulogPluginInfo != null && ulogPluginInfo.isEnabled()) { + // If this is a replica of type passive, don't create the update log + boolean skipUpdateLog = core.getCoreDescriptor().getCloudDescriptor() != null && !core.getCoreDescriptor().getCloudDescriptor().requiresTransactionLog(); + if (updateLog == null && ulogPluginInfo != null && !skipUpdateLog) { String dataDir = (String)ulogPluginInfo.initArgs.get("dir"); String ulogDir = core.getCoreDescriptor().getUlogDir(); diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java index c50add4a45e3..ddacb1971b43 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java @@ -1706,7 +1706,7 @@ public void run() { public void doReplay(TransactionLog translog) { try { - loglog.warn("Starting log replay " + translog + " active=" + activeLog + " starting pos=" + recoveryInfo.positionOfStart); + loglog.warn("Starting log replay " + translog + " active=" + activeLog + " starting pos=" + recoveryInfo.positionOfStart + " inSortedOrder=" + inSortedOrder); long lastStatusTime = System.nanoTime(); if (inSortedOrder) { tlogReader = translog.getSortedReader(recoveryInfo.positionOfStart); @@ -1786,7 +1786,7 @@ public void doReplay(TransactionLog translog) { recoveryInfo.adds++; AddUpdateCommand cmd = convertTlogEntryToAddUpdateCommand(req, entry, oper, version); cmd.setFlags(UpdateCommand.REPLAY | UpdateCommand.IGNORE_AUTOCOMMIT); - log.debug("{} {}", oper == ADD ? "add" : "update", cmd); + if (debug) log.debug("{} {}", oper == ADD ? "add" : "update", cmd); proc.processAdd(cmd); break; } @@ -1854,6 +1854,7 @@ public void doReplay(TransactionLog translog) { // something wrong with the request? } assert TestInjection.injectUpdateLogReplayRandomPause(); + } CommitUpdateCommand cmd = new CommitUpdateCommand(req, false); diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java index cb1b2fb68c6c..e9f63d507024 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java @@ -25,6 +25,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -279,7 +280,7 @@ public String toString() { // this is set to true in the constructor if the next processors in the chain // are custom and may modify the SolrInputDocument racing with its serialization for replication private final boolean cloneRequiredOnLeader; - private final boolean onlyLeaderIndexes; + private final Replica.Type replicaType; public DistributedUpdateProcessor(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { this(req, rsp, new AtomicUpdateDocumentMerger(req), next); @@ -324,12 +325,10 @@ public DistributedUpdateProcessor(SolrQueryRequest req, if (cloudDesc != null) { collection = cloudDesc.getCollectionName(); - ClusterState cstate = zkController.getClusterState(); - DocCollection coll = cstate.getCollection(collection); - onlyLeaderIndexes = coll.getRealtimeReplicas() == 1; + replicaType = cloudDesc.getReplicaType(); } else { collection = null; - onlyLeaderIndexes = false; + replicaType = Replica.Type.REALTIME; } boolean shouldClone = false; @@ -666,7 +665,7 @@ private void doDefensiveChecks(DistribPhase phase) { // used for deleteByQuery to get the list of nodes this leader should forward to - private List setupRequest() { + private List setupRequestForDBQ() { List nodes = null; String shardId = cloudDesc.getShardId(); @@ -680,7 +679,7 @@ private List setupRequest() { forwardToLeader = false; List replicaProps = zkController.getZkStateReader() - .getReplicaProps(collection, shardId, leaderReplica.getName(), null, Replica.State.DOWN); + .getReplicaProps(collection, shardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.REALTIME, Replica.Type.APPEND)); if (replicaProps != null) { nodes = new ArrayList<>(replicaProps.size()); for (ZkCoreNodeProps props : replicaProps) { @@ -1190,7 +1189,7 @@ protected boolean versionAdd(AddUpdateCommand cmd) throws IOException { checkDeleteByQueries = true; } } - if (onlyLeaderIndexes && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (replicaType == Replica.Type.APPEND && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { cmd.setFlags(cmd.getFlags() | UpdateCommand.IGNORE_INDEXWRITER); } } @@ -1576,7 +1575,7 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { if (zkEnabled && DistribPhase.TOLEADER == phase) { // This core should be a leader isLeader = true; - replicas = setupRequest(); + replicas = setupRequestForDBQ(); } else if (DistribPhase.FROMLEADER == phase) { isLeader = false; } @@ -1610,8 +1609,9 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId(); Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry( collection, myShardId); + // DBQ forwarded to Realtime and Append List replicaProps = zkController.getZkStateReader() - .getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN); + .getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.REALTIME, Replica.Type.APPEND)); if (replicaProps != null) { final List myReplicas = new ArrayList<>(replicaProps.size()); for (ZkCoreNodeProps replicaProp : replicaProps) { @@ -1699,10 +1699,10 @@ protected void versionDeleteByQuery(DeleteUpdateCommand cmd) throws IOException return; } - if (onlyLeaderIndexes && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (replicaType == Replica.Type.APPEND && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + // Append replica not leader, don't write the DBQ to IW cmd.setFlags(cmd.getFlags() | UpdateCommand.IGNORE_INDEXWRITER); } - doLocalDelete(cmd); } } @@ -1857,7 +1857,7 @@ protected boolean versionDelete(DeleteUpdateCommand cmd) throws IOException { } } - if (onlyLeaderIndexes && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (replicaType == Replica.Type.APPEND && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { cmd.setFlags(cmd.getFlags() | UpdateCommand.IGNORE_INDEXWRITER); } } @@ -1884,14 +1884,14 @@ public void processCommit(CommitUpdateCommand cmd) throws IOException { zkCheck(); nodes = getCollectionUrls(req, req.getCore().getCoreDescriptor() - .getCloudDescriptor().getCollectionName()); + .getCloudDescriptor().getCollectionName(), EnumSet.of(Replica.Type.APPEND,Replica.Type.REALTIME)); if (isLeader && nodes.size() == 1) { singleLeader = true; } } if (!zkEnabled || req.getParams().getBool(COMMIT_END_POINT, false) || singleLeader) { - if (onlyLeaderIndexes) { + if (replicaType == Replica.Type.APPEND) { // REALTIME will always commit try { Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry( collection, cloudDesc.getShardId()); @@ -1904,7 +1904,7 @@ public void processCommit(CommitUpdateCommand cmd) throws IOException { doLocalCommit(cmd); } else { assert TestInjection.waitForInSyncWithLeader(req.getCore(), - zkController, collection, cloudDesc.getShardId()); + zkController, collection, cloudDesc.getShardId()): "Core " + req.getCore() + " not in sync with leader"; } } catch (InterruptedException e) { throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + cloudDesc.getShardId(), e); @@ -1958,7 +1958,7 @@ public void finish() throws IOException { - private List getCollectionUrls(SolrQueryRequest req, String collection) { + private List getCollectionUrls(SolrQueryRequest req, String collection, EnumSet types) { ClusterState clusterState = req.getCore() .getCoreContainer().getZkController().getClusterState(); Map slices = clusterState.getSlicesMap(collection); @@ -1973,6 +1973,10 @@ private List getCollectionUrls(SolrQueryRequest req, String collection) { Map shardMap = replicas.getReplicasMap(); for (Entry entry : shardMap.entrySet()) { + if (!types.contains(entry.getValue().getType())) { + log.info("getCollectionUrls: Skipping replica " + entry.getValue().getName());//nocommit: too verbose + continue; + } ZkCoreNodeProps nodeProps = new ZkCoreNodeProps(entry.getValue()); if (clusterState.liveNodesContain(nodeProps.getNodeName())) { urls.add(new StdNode(nodeProps, collection, replicas.getName())); diff --git a/solr/core/src/test-files/log4j.properties b/solr/core/src/test-files/log4j.properties index 26972038f9c5..c464a9fd9def 100644 --- a/solr/core/src/test-files/log4j.properties +++ b/solr/core/src/test-files/log4j.properties @@ -1,5 +1,5 @@ # Logging level -log4j.rootLogger=INFO, CONSOLE +log4j.rootLogger=DEBUG, CONSOLE log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender log4j.appender.CONSOLE.Target=System.err @@ -10,6 +10,12 @@ log4j.logger.org.apache.hadoop=WARN log4j.logger.org.apache.directory=WARN log4j.logger.org.apache.solr.hadoop=INFO +log4j.logger.org.apache.solr.cloud.OverseerTaskProcessor=INFO +log4j.logger.org.apache.solr.cloud.OverseerTaskQueue=INFO +log4j.logger.org.apache.solr.cloud.OverseerTaskQueue=INFO +log4j.logger.org.apache.solr.common.cloud.SolrZkClient=INFO +log4j.logger.org.apache.solr.util.stats.InstrumentedPoolingHttpClientConnectionManager=INFO +log4j.logger.com.codehale.metrics=INFO #log4j.logger.org.apache.solr.update.processor.LogUpdateProcessorFactory=DEBUG #log4j.logger.org.apache.solr.update.processor.DistributedUpdateProcessor=DEBUG #log4j.logger.org.apache.solr.update.PeerSync=DEBUG @@ -31,6 +37,6 @@ log4j.logger.org.apache.solr.hadoop=INFO #log4j.logger.org.apache.http.impl.conn.PoolingHttpClientConnectionManager=DEBUG #log4j.logger.org.apache.http.impl.conn.BasicClientConnectionManager=DEBUG -#log4j.logger.org.apache.http=DEBUG +log4j.logger.org.apache.http=INFO #log4j.logger.org.apache.solr.client.solrj.impl.SolrHttpRequestRetryHandler=DEBUG -#log4j.logger.org.eclipse.jetty=DEBUG +log4j.logger.org.eclipse.jetty=INFO diff --git a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/solrconfig.xml b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/solrconfig.xml index 059e58f447c2..8da7d2847e9b 100644 --- a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/solrconfig.xml +++ b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/solrconfig.xml @@ -33,7 +33,7 @@ ${solr.commitwithin.softcommit:true} - + diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java index 5eb4b3b35d9d..51f9fe9301fd 100644 --- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java +++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java @@ -62,8 +62,8 @@ public BasicDistributedZk2Test() { } @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1 : -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } @Test diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java index 1c23c9cf678e..8d0839f018e5 100644 --- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java @@ -119,8 +119,8 @@ public BasicDistributedZkTest() { } @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1 : -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } @Override diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java index ffc5262a9914..a389005cb66a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java @@ -112,8 +112,8 @@ public ChaosMonkeyNothingIsSafeTest() { } @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1 : -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } @Test diff --git a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java index ed9ed41b011b..e75a8547fdc5 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java @@ -286,7 +286,7 @@ public void testCreateShouldFailOnExistingCore() throws Exception { // first we make a core with the core name the collections api // will try and use - this will cause our mock fail Create createCmd = new Create(); - createCmd.setCoreName("halfcollection_shard1_replica1"); + createCmd.setCoreName(Assign.buildCoreName("halfcollection", "shard1", Replica.Type.REALTIME, 1)); createCmd.setCollection("halfcollectionblocker"); String dataDir = createTempDir().toFile().getAbsolutePath(); createCmd.setDataDir(dataDir); @@ -298,7 +298,7 @@ public void testCreateShouldFailOnExistingCore() throws Exception { } createCmd = new Create(); - createCmd.setCoreName("halfcollection_shard1_replica1"); + createCmd.setCoreName(Assign.buildCoreName("halfcollection", "shard1", Replica.Type.REALTIME, 1)); createCmd.setCollection("halfcollectionblocker2"); dataDir = createTempDir().toFile().getAbsolutePath(); createCmd.setDataDir(dataDir); diff --git a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java index 3e0d84034453..e2a80b69741d 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java @@ -62,7 +62,7 @@ public void testCreateAndDeleteCollection() throws Exception { Map> coresStatus = response.getCollectionCoresStatus(); assertEquals(4, coresStatus.size()); for (int i=0; i<4; i++) { - NamedList status = coresStatus.get(collectionName + "_shard" + (i/2+1) + "_replica" + (i%2+1)); + NamedList status = coresStatus.get(Assign.buildCoreName(collectionName, "shard" + (i/2+1), Replica.Type.REALTIME, (i%2+1))); assertEquals(0, (int)status.get("status")); assertTrue(status.get("QTime") > 0); } diff --git a/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java b/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java index 8904ea827be2..db9ecb4769d7 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java @@ -58,8 +58,8 @@ public class ForceLeaderTest extends HttpPartitionTest { private final boolean onlyLeaderIndexes = random().nextBoolean(); @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1 : -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } @Test diff --git a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java index 01002cfee44c..d0b0c5eaa83b 100644 --- a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java @@ -85,8 +85,8 @@ public HttpPartitionTest() { } @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1 : -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } /** diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java index 457b9d9ef850..fd1b40343fd5 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java @@ -46,8 +46,8 @@ public LeaderInitiatedRecoveryOnCommitTest() { } @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1 : -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } @Override diff --git a/solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java b/solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java index a4e8d6f2bc43..629740816c36 100644 --- a/solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.EnumSet; import java.util.List; import java.util.concurrent.Semaphore; @@ -30,9 +31,11 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.CollectionStatePredicate; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; @@ -44,6 +47,7 @@ import org.apache.solr.update.UpdateHandler; import org.apache.solr.update.UpdateLog; import org.apache.solr.util.RefCounted; +import org.apache.zookeeper.KeeperException; import org.junit.BeforeClass; import org.junit.Test; @@ -59,10 +63,13 @@ public static void setupCluster() throws Exception { .addConfig("config", TEST_PATH().resolve("configsets") .resolve("cloud-minimal-inplace-updates").resolve("conf")) .configure(); + + CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false"); + CollectionAdminResponse response = clusterPropRequest.process(cluster.getSolrClient()); + assertEquals(0, response.getStatus()); CollectionAdminRequest - .createCollection(COLLECTION, "config", 1, 3) - .setRealtimeReplicas(1) + .createCollection(COLLECTION, "config", 1, 0, 3, 0) .setMaxShardsPerNode(1) .process(cluster.getSolrClient()); AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), @@ -71,6 +78,7 @@ public static void setupCluster() throws Exception { @Test public void test() throws Exception { + assertNumberOfReplicas(0, 3, 0, false, true); basicTest(); recoveryTest(); dbiTest(); @@ -252,6 +260,7 @@ public void basicLeaderElectionTest() throws Exception { .add(sdoc("id", "4")) .process(cloudClient, COLLECTION); ChaosMonkey.start(oldLeaderJetty); + waitForState("Replica not removed", "collection1", activeReplicaCount(0, 3, 0)); AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), false, true, 60); checkRTG(1,4, cluster.getJettySolrRunners()); @@ -347,7 +356,7 @@ private void checkRTG(int from, int to, List solrRunners) throw } private void checkShardConsistency(int expected, int numTry) throws Exception{ - + String replicaNotInSync = null; for (int i = 0; i < numTry; i++) { boolean inSync = true; for (JettySolrRunner solrRunner: cluster.getJettySolrRunners()) { @@ -357,15 +366,16 @@ private void checkShardConsistency(int expected, int numTry) throws Exception{ long results = client.query(COLLECTION, query).getResults().getNumFound(); if (expected != results) { inSync = false; - Thread.sleep(500); + replicaNotInSync = solrRunner.getNodeName(); break; } } } if (inSync) return; + Thread.sleep(500); } - fail("Some replicas are not in sync with leader"); + fail("Some replicas are not in sync with leader: " + replicaNotInSync); } private void waitForReplicasCatchUp(int numTry) throws IOException, InterruptedException { @@ -431,5 +441,48 @@ private List getSolrRunner(boolean isLeader) { } return rs; } + + // TODO: This is copy/paste from TestPassiveReplica, refactor + private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int numPassive, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { + if (updateCollection) { + cluster.getSolrClient().getZkStateReader().forceUpdateCollection("collection1"); + } + DocCollection docCollection = getCollectionState("collection1"); + assertNotNull(docCollection); + assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, + docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, + docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of active replicas: " + docCollection, numActive, + docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + return docCollection; + } + + private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive, int numPassive) { + return (liveNodes, collectionState) -> { + int writersFound = 0, activesFound = 0, passivesFound = 0; + if (collectionState == null) + return false; + for (Slice slice : collectionState) { + for (Replica replica : slice) { + if (replica.isActive(liveNodes)) + switch (replica.getType()) { + case APPEND: + activesFound++; + break; + case PASSIVE: + passivesFound++; + break; + case REALTIME: + writersFound++; + break; + default: + throw new AssertionError("Unexpected replica type"); + } + } + } + return numWriter == writersFound && numActive == activesFound && numPassive == passivesFound; + }; + } } \ No newline at end of file diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java index 48ac91f4a6c2..340adbbbcb26 100644 --- a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java @@ -27,6 +27,7 @@ import org.apache.solr.cloud.Overseer.LeaderStatus; import org.apache.solr.cloud.OverseerTaskQueue.QueueEvent; import org.apache.solr.common.cloud.ClusterState; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; @@ -346,7 +347,7 @@ protected void verifySubmitCaptures( assertEquals(numberOfSlices * numberOfReplica, coreNames.size()); for (int i = 1; i <= numberOfSlices; i++) { for (int j = 1; j <= numberOfReplica; j++) { - String coreName = COLLECTION_NAME + "_shard" + i + "_replica" + j; + String coreName = Assign.buildCoreName(COLLECTION_NAME, "shard" + i, Replica.Type.REALTIME, j); assertTrue("Shard " + coreName + " was not created", coreNames.contains(coreName)); diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java index b0721a2b8e07..f5c5db0ec86e 100644 --- a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java @@ -1210,7 +1210,7 @@ public void testExternalClusterStateChangeBehavior() throws Exception { ZkStateReader.STATE_PROP, Replica.State.RECOVERING.toString()); q.offer(Utils.toJSON(m)); - + m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.STATE.toLower(), ZkStateReader.BASE_URL_PROP, "http://127.0.0.1/solr", @@ -1221,7 +1221,7 @@ public void testExternalClusterStateChangeBehavior() throws Exception { ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString()); q.offer(Utils.toJSON(m)); - + Stat stat = new Stat(); byte[] data = zkClient.getData("/clusterstate.json", null, stat, true); // Simulate an external modification diff --git a/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java b/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java index a8e14bf5465e..1da9aca086ea 100644 --- a/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java @@ -40,8 +40,8 @@ public RecoveryAfterSoftCommitTest() { } @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1: -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } @BeforeClass diff --git a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java index bf9b5e014c88..e00ea0d639ae 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java @@ -79,6 +79,11 @@ public class ShardSplitTest extends BasicDistributedZkTest { public ShardSplitTest() { schemaString = "schema15.xml"; // we need a string id } + + @Override + protected boolean useAppendReplicas() { + return false; + } @Override public void distribSetUp() throws Exception { @@ -86,12 +91,6 @@ public void distribSetUp() throws Exception { useFactory(null); } - //TODO for now, onlyLeaderIndexes do not work with ShardSplitTest - @Override - protected int getRealtimeReplicas() { - return -1; - } - @Test public void test() throws Exception { diff --git a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java new file mode 100644 index 000000000000..fe353d25bdd7 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java @@ -0,0 +1,808 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.CollectionStatePredicate; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.core.SolrCore; +import org.apache.solr.update.DirectUpdateHandler2; +import org.apache.solr.update.SolrIndexWriter; +import org.apache.solr.update.UpdateHandler; +import org.apache.solr.update.UpdateLog; +import org.apache.solr.util.RefCounted; +import org.apache.solr.util.TestInjection; +import org.apache.solr.util.TimeOut; +import org.apache.zookeeper.KeeperException; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.carrotsearch.randomizedtesting.annotations.Repeat; + +@Slow +public class TestAppendReplica extends SolrCloudTestCase { + + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private String collectionName = null; + private final static int REPLICATION_TIMEOUT_SECS = 10; + + private String suggestedCollectionName() { + return (getTestClass().getSimpleName().replace("Test", "") + "_" + getTestName().split(" ")[0]).replaceAll("(.)(\\p{Upper})", "$1_$2").toLowerCase(Locale.ROOT); + } + + @BeforeClass + public static void setupCluster() throws Exception { + TestInjection.waitForReplicasInSync = null; // We'll be explicit about this in this test + configureCluster(2) // 2 + random().nextInt(3) + .addConfig("conf", configset("cloud-minimal-inplace-updates")) + .configure(); + CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false"); + CollectionAdminResponse response = clusterPropRequest.process(cluster.getSolrClient()); + assertEquals(0, response.getStatus()); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + collectionName = suggestedCollectionName(); + expectThrows(SolrException.class, () -> getCollectionState(collectionName)); + } + + @Override + public void tearDown() throws Exception { + for (JettySolrRunner jetty:cluster.getJettySolrRunners()) { + if (!jetty.isRunning()) { + LOG.warn("Jetty {} not running, probably some bad test. Starting it", jetty.getLocalPort()); + ChaosMonkey.start(jetty); + } + } + if (cluster.getSolrClient().getZkStateReader().getClusterState().getCollectionOrNull(collectionName) != null) { + LOG.info("tearDown deleting collection"); + CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); + waitForDeletion(collectionName); + } + super.tearDown(); + } + + // Just to compare test time, nocommit + @Ignore + public void testCreateDelete2() throws Exception { + try { + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 8, 0, 0).process(cluster.getSolrClient()); + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); +// assertEquals("Expecting 4 relpicas per shard", +// 8, docCollection.getReplicas().size()); +// assertEquals("Expecting 6 passive replicas, 3 per shard", +// 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); +// assertEquals("Expecting 2 writer replicas, one per shard", +// 2, docCollection.getReplicas(EnumSet.of(Replica.Type.WRITER)).size()); +// for (Slice s:docCollection.getSlices()) { +// // read-only replicas can never become leaders +// assertFalse(s.getLeader().isReadOnly()); +// } + } finally { + zkClient().printLayoutToStdOut(); + } + } + + /** + * Asserts that Update logs exist for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#REALTIME}, but not + * for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} + */ + private void assertUlogPresence(DocCollection collection) { + for (Slice s:collection.getSlices()) { + for (Replica r:s.getReplicas()) { + SolrCore core = null; + try { + core = cluster.getReplicaJetty(r).getCoreContainer().getCore(r.getCoreName()); + assertNotNull(core); + assertTrue("Update log should exist for replicas of type Append", + new java.io.File(core.getUlogDir()).exists()); + } finally { + core.close(); + } + } + } + } + + @Repeat(iterations=2) // 2 times to make sure cleanup is complete and we can create the same collection + public void testCreateDelete() throws Exception { + try { + CollectionAdminRequest.createCollection(collectionName, "conf", 2, 0, 4, 0) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); + assertEquals("Expecting 2 shards", + 2, docCollection.getSlices().size()); + assertEquals("Expecting 4 relpicas per shard", + 8, docCollection.getReplicas().size()); + assertEquals("Expecting 8 append replicas, 4 per shard", + 8, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).size()); + assertEquals("Expecting no realtime replicas", + 0, docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); + assertEquals("Expecting no passive replicas", + 0, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); + for (Slice s:docCollection.getSlices()) { + assertTrue(s.getLeader().getType() == Replica.Type.APPEND); + List shardElectionNodes = cluster.getZkClient().getChildren(ZkStateReader.getShardLeadersElectPath(collectionName, s.getName()), null, true); + assertEquals("Unexpected election nodes for Shard: " + s.getName() + ": " + Arrays.toString(shardElectionNodes.toArray()), + 4, shardElectionNodes.size()); + } + assertUlogPresence(docCollection); + } finally { + zkClient().printLayoutToStdOut(); + } + } + + public void testAddDocs() throws Exception { + int numAppendReplicas = 1 + random().nextInt(3); + DocCollection docCollection = createAndWaitForCollection(1, 0, numAppendReplicas, 0); + assertEquals(1, docCollection.getSlices().size()); + + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + + Slice s = docCollection.getSlices().iterator().next(); + try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) { + assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + } + + TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS); + for (Replica r:s.getReplicas(EnumSet.of(Replica.Type.APPEND))) { + //TODO: assert replication < REPLICATION_TIMEOUT_SECS + try (HttpSolrClient appendReplicaClient = getHttpSolrClient(r.getCoreUrl())) { + while (true) { + try { + assertEquals("Replica " + r.getName() + " not up to date after 10 seconds", + 1, appendReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + // Append replicas process all updates + SolrQuery req = new SolrQuery( + "qt", "/admin/plugins", + "stats", "true"); + QueryResponse statsResponse = appendReplicaClient.query(req); +// TODO: uncomment when SOLR-10569 is fixed +// assertEquals("Append replicas should recive all updates. Replica: " + r + ", response: " + statsResponse, +// 1L, ((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "updateHandler", "stats", "cumulative_adds")); + break; + } catch (AssertionError e) { + if (t.hasTimedOut()) { + throw e; + } else { + Thread.sleep(100); + } + } + } + } + } + assertUlogPresence(docCollection); + } + + public void testAddRemoveAppendReplica() throws Exception { + DocCollection docCollection = createAndWaitForCollection(2, 0, 1, 0); + assertEquals(2, docCollection.getSlices().size()); + + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.APPEND).process(cluster.getSolrClient()); + docCollection = assertNumberOfReplicas(0, 3, 0, true, false); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard2", Replica.Type.APPEND).process(cluster.getSolrClient()); + docCollection = assertNumberOfReplicas(0, 4, 0, true, false); + + waitForState("Expecting collection to have 2 shards and 2 replica each", collectionName, clusterShape(2, 2)); + + //Delete passive replica from shard1 + CollectionAdminRequest.deleteReplica( + collectionName, + "shard1", + docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.APPEND)).get(0).getName()) + .process(cluster.getSolrClient()); + assertNumberOfReplicas(0, 3, 0, true, true); + } + + public void testRemoveLeader() throws Exception { + doReplaceLeader(true); + } + + public void testKillLeader() throws Exception { + doReplaceLeader(false); + } + + public void testPassiveReplicaStates() { + // Validate that passive replicas go through the correct states when starting, stopping, reconnecting + } + + public void testPassiveReplicaCantConnectToZooKeeper() { + + } + + public void testRealTimeGet() { + // should be redirected to writers or error + } + + /* + * validate leader election and that replication still happens on a new leader + */ + private void doReplaceLeader(boolean removeReplica) throws Exception { + DocCollection docCollection = createAndWaitForCollection(1, 0, 2, 0); + + // Add a document and commit + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + Slice s = docCollection.getSlices().iterator().next(); + try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) { + assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + } + + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)), REPLICATION_TIMEOUT_SECS); + + // Delete leader replica from shard1 + JettySolrRunner leaderJetty = null; + if (removeReplica) { + CollectionAdminRequest.deleteReplica( + collectionName, + "shard1", + s.getLeader().getName()) + .process(cluster.getSolrClient()); + } else { + leaderJetty = cluster.getReplicaJetty(s.getLeader()); + ChaosMonkey.kill(leaderJetty); + waitForState("Leader replica not removed", collectionName, clusterShape(1, 1)); + // Wait for cluster state to be updated + waitForState("Replica state not updated in cluster state", + collectionName, clusterStateReflectsActiveAndDownReplicas()); + } + docCollection = assertNumberOfReplicas(0, 1, 0, true, true); + + // Wait until a new leader is elected + TimeOut t = new TimeOut(30, TimeUnit.SECONDS); + while (!t.hasTimedOut()) { + docCollection = getCollectionState(collectionName); + Replica leader = docCollection.getSlice("shard1").getLeader(); + if (leader != null && leader.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())) { + break; + } + Thread.sleep(500); + } + assertFalse("Timeout waiting for a new leader to be elected", t.hasTimedOut()); + + // There is a new leader, I should be able to add and commit + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "zoo")); + cluster.getSolrClient().commit(collectionName); + + // Queries should still work + waitForNumDocsInAllReplicas(2, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)), REPLICATION_TIMEOUT_SECS); + // Start back the node + if (removeReplica) { + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.APPEND).process(cluster.getSolrClient()); + } else { + ChaosMonkey.start(leaderJetty); + } + waitForState("Expected collection to be 1x2", collectionName, clusterShape(1, 2)); + // added replica should replicate from the leader + waitForNumDocsInAllReplicas(2, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)), REPLICATION_TIMEOUT_SECS); + } + + public void testKillAppendReplica() throws Exception { + DocCollection docCollection = createAndWaitForCollection(1, 0, 2, 0); + + waitForNumDocsInAllActiveReplicas(0); + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + waitForNumDocsInAllActiveReplicas(1); + + JettySolrRunner passiveReplicaJetty = cluster.getReplicaJetty(docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.APPEND)).get(0)); + ChaosMonkey.kill(passiveReplicaJetty); + waitForState("Replica not removed", collectionName, activeReplicaCount(0, 1, 0)); + // Also wait for the replica to be placed in state="down" + waitForState("Didn't update state", collectionName, clusterStateReflectsActiveAndDownReplicas()); + + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + waitForNumDocsInAllActiveReplicas(2); + + ChaosMonkey.start(passiveReplicaJetty); + waitForState("Replica not added", collectionName, activeReplicaCount(0, 2, 0)); + waitForNumDocsInAllActiveReplicas(2); + } + + public void testSearchWhileReplicationHappens() { + + } + + public void testReplication() { + // Validate incremental replication + } + + public void testOnlyLeaderIndexes() throws Exception { + createAndWaitForCollection(1, 0, 2, 0); + + CloudSolrClient cloudClient = cluster.getSolrClient(); + new UpdateRequest() + .add(sdoc("id", "1")) + .add(sdoc("id", "2")) + .add(sdoc("id", "3")) + .add(sdoc("id", "4")) + .process(cloudClient, collectionName); + + { + UpdateHandler updateHandler = getSolrCore(true).get(0).getUpdateHandler(); + RefCounted iwRef = updateHandler.getSolrCoreState().getIndexWriter(null); + assertTrue("IndexWriter at leader must see updates ", iwRef.get().hasUncommittedChanges()); + iwRef.decref(); + } + + for (SolrCore solrCore : getSolrCore(false)) { + RefCounted iwRef = solrCore.getUpdateHandler().getSolrCoreState().getIndexWriter(null); + assertFalse("IndexWriter at replicas must not see updates ", iwRef.get().hasUncommittedChanges()); + iwRef.decref(); + } + + checkRTG(1, 4, cluster.getJettySolrRunners()); + + new UpdateRequest() + .deleteById("1") + .deleteByQuery("id:2") + .process(cloudClient, collectionName); + + // The DBQ is not processed at replicas, so we still can get doc2 and other docs by RTG + checkRTG(2,4, getSolrRunner(false)); + + new UpdateRequest() + .commit(cloudClient, collectionName); + + waitForNumDocsInAllActiveReplicas(2); + + // Update log roll over + for (SolrCore solrCore : getSolrCore(false)) { + UpdateLog updateLog = solrCore.getUpdateHandler().getUpdateLog(); + assertFalse(updateLog.hasUncommittedChanges()); + } + + // UpdateLog copy over old updates + for (int i = 15; i <= 150; i++) { + cloudClient.add(collectionName, sdoc("id",String.valueOf(i))); + if (random().nextInt(100) < 15 & i != 150) { + cloudClient.commit(collectionName); + } + } + checkRTG(120,150, cluster.getJettySolrRunners()); + waitForReplicasCatchUp(20); + } + + public void testRecovery() throws Exception { + boolean useKill = random().nextBoolean(); + createAndWaitForCollection(1, 0, 2, 0); + + CloudSolrClient cloudClient = cluster.getSolrClient(); + new UpdateRequest() + .add(sdoc("id", "3")) + .add(sdoc("id", "4")) + .commit(cloudClient, collectionName); + // Replica recovery + new UpdateRequest() + .add(sdoc("id", "5")) + .process(cloudClient, collectionName); + JettySolrRunner solrRunner = getSolrRunner(false).get(0); + if (useKill) { + ChaosMonkey.kill(solrRunner); + } else { + ChaosMonkey.stop(solrRunner); + } + new UpdateRequest() + .add(sdoc("id", "6")) + .process(cloudClient, collectionName); + ChaosMonkey.start(solrRunner); + waitForState("Replica didn't recover", collectionName, activeReplicaCount(0,2,0)); + // We skip peerSync, so replica will always trigger commit on leader + waitForNumDocsInAllActiveReplicas(4); + + // If I add the doc immediately, the leader fails to communicate with the follower with broken pipe. Related to SOLR-9555 I believe + //nocommit + Thread.sleep(10000); + + // More Replica recovery testing + new UpdateRequest() + .add(sdoc("id", "7")) + .process(cloudClient, collectionName); + checkRTG(3,7, cluster.getJettySolrRunners()); + DirectUpdateHandler2.commitOnClose = false; + ChaosMonkey.stop(solrRunner); + DirectUpdateHandler2.commitOnClose = true; + ChaosMonkey.start(solrRunner); + waitForState("Replica didn't recover", collectionName, activeReplicaCount(0,2,0)); + checkRTG(3,7, cluster.getJettySolrRunners()); + waitForNumDocsInAllActiveReplicas(5, 0); + + // Test replica recovery apply buffer updates + Semaphore waitingForBufferUpdates = new Semaphore(0); + Semaphore waitingForReplay = new Semaphore(0); + RecoveryStrategy.testing_beforeReplayBufferingUpdates = () -> { + try { + waitingForReplay.release(); + waitingForBufferUpdates.acquire(); + } catch (InterruptedException e) { + e.printStackTrace(); + fail("Test interrupted: " + e.getMessage()); + } + }; + if (useKill) { + ChaosMonkey.kill(solrRunner); + } else { + ChaosMonkey.stop(solrRunner); + } + ChaosMonkey.start(solrRunner); + waitingForReplay.acquire(); + new UpdateRequest() + .add(sdoc("id", "8")) + .add(sdoc("id", "9")) + .process(cloudClient, collectionName); + waitingForBufferUpdates.release(); + RecoveryStrategy.testing_beforeReplayBufferingUpdates = null; + waitForState("Replica didn't recover", collectionName, activeReplicaCount(0,2,0)); + checkRTG(3,9, cluster.getJettySolrRunners()); + waitForNumDocsInAllActiveReplicas(5, 0); + for (SolrCore solrCore : getSolrCore(false)) { + RefCounted iwRef = solrCore.getUpdateHandler().getSolrCoreState().getIndexWriter(null); + assertFalse("IndexWriter at replicas must not see updates ", iwRef.get().hasUncommittedChanges()); + iwRef.decref(); + } + } + + public void testDeleteById() throws Exception{ + createAndWaitForCollection(1,0,2,0); + CloudSolrClient cloudClient = cluster.getSolrClient(); + new UpdateRequest() + .deleteByQuery("*:*") + .commit(cluster.getSolrClient(), collectionName); + new UpdateRequest() + .add(sdoc("id", "1")) + .commit(cloudClient, collectionName); + waitForNumDocsInAllActiveReplicas(1); + new UpdateRequest() + .deleteById("1") + .process(cloudClient, collectionName); + boolean successs = false; + try { + checkRTG(1, 1, cluster.getJettySolrRunners()); + successs = true; + } catch (AssertionError e) { + //expected + } + assertFalse("Doc1 is deleted but it's still exist", successs); + } + + public void testBasicLeaderElection() throws Exception { + createAndWaitForCollection(1,0,2,0); + CloudSolrClient cloudClient = cluster.getSolrClient(); + new UpdateRequest() + .deleteByQuery("*:*") + .commit(cluster.getSolrClient(), collectionName); + new UpdateRequest() + .add(sdoc("id", "1")) + .add(sdoc("id", "2")) + .process(cloudClient, collectionName); + JettySolrRunner oldLeaderJetty = getSolrRunner(true).get(0); + ChaosMonkey.kill(oldLeaderJetty); + waitForState("Replica not removed", collectionName, activeReplicaCount(0, 1, 0)); + new UpdateRequest() + .add(sdoc("id", "3")) + .add(sdoc("id", "4")) + .process(cloudClient, collectionName); + ChaosMonkey.start(oldLeaderJetty); + waitForState("Replica not added", collectionName, activeReplicaCount(0, 2, 0)); + checkRTG(1,4, cluster.getJettySolrRunners()); + new UpdateRequest() + .commit(cloudClient, collectionName); + waitForNumDocsInAllActiveReplicas(4, 0); + } + + public void testOutOfOrderDBQWithInPlaceUpdates() throws Exception { + createAndWaitForCollection(1,0,2,0); + assertFalse(getSolrCore(true).get(0).getLatestSchema().getField("inplace_updatable_int").indexed()); + assertFalse(getSolrCore(true).get(0).getLatestSchema().getField("inplace_updatable_int").stored()); + assertTrue(getSolrCore(true).get(0).getLatestSchema().getField("inplace_updatable_int").hasDocValues()); + List updates = new ArrayList<>(); + updates.add(simulatedUpdateRequest(null, "id", 1, "title_s", "title0_new", "inplace_updatable_int", 5, "_version_", 1L)); // full update + updates.add(simulatedDBQ("inplace_updatable_int:5", 3L)); + updates.add(simulatedUpdateRequest(1L, "id", 1, "inplace_updatable_int", 6, "_version_", 2L)); + for (JettySolrRunner solrRunner: getSolrRunner(false)) { + try (SolrClient client = solrRunner.newClient()) { + for (UpdateRequest up : updates) { + up.process(client, collectionName); + } + } + } + JettySolrRunner oldLeaderJetty = getSolrRunner(true).get(0); + ChaosMonkey.kill(oldLeaderJetty); + waitForState("Replica not removed", collectionName, activeReplicaCount(0, 1, 0)); + ChaosMonkey.start(oldLeaderJetty); + waitForState("Replica not added", collectionName, activeReplicaCount(0, 2, 0)); + checkRTG(1,1, cluster.getJettySolrRunners()); + SolrDocument doc = cluster.getSolrClient().getById(collectionName,"1"); + assertNotNull(doc.get("title_s")); + } + + private UpdateRequest simulatedUpdateRequest(Long prevVersion, Object... fields) throws SolrServerException, IOException { + SolrInputDocument doc = sdoc(fields); + + // get baseUrl of the leader + String baseUrl = getBaseUrl(); + + UpdateRequest ur = new UpdateRequest(); + ur.add(doc); + ur.setParam("update.distrib", "FROMLEADER"); + if (prevVersion != null) { + ur.setParam("distrib.inplace.prevversion", String.valueOf(prevVersion)); + ur.setParam("distrib.inplace.update", "true"); + } + ur.setParam("distrib.from", baseUrl); + return ur; + } + + private UpdateRequest simulatedDBQ(String query, long version) throws SolrServerException, IOException { + String baseUrl = getBaseUrl(); + + UpdateRequest ur = new UpdateRequest(); + ur.deleteByQuery(query); + ur.setParam("_version_", ""+version); + ur.setParam("update.distrib", "FROMLEADER"); + ur.setParam("distrib.from", baseUrl); + return ur; + } + + private String getBaseUrl() { + DocCollection collection = cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(collectionName); + Slice slice = collection.getSlice("shard1"); + return slice.getLeader().getCoreUrl(); + } + + private DocCollection createAndWaitForCollection(int numShards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) throws SolrServerException, IOException, KeeperException, InterruptedException { + CollectionAdminRequest.createCollection(collectionName, "conf", numShards, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + int numReplicasPerShard = numRealtimeReplicas + numAppendReplicas + numPassiveReplicas; + cluster.getSolrClient().getZkStateReader().registerCore(collectionName); //TODO: Is this needed? + waitForState("Expected collection to be created with " + numShards + " shards and " + numReplicasPerShard + " replicas", + collectionName, clusterShape(numShards, numReplicasPerShard)); + return assertNumberOfReplicas(numRealtimeReplicas*numShards, numAppendReplicas*numShards, numPassiveReplicas*numShards, false, true); + } + + private void waitForNumDocsInAllActiveReplicas(int numDocs) throws IOException, SolrServerException, InterruptedException { + waitForNumDocsInAllActiveReplicas(numDocs, REPLICATION_TIMEOUT_SECS); + } + + private void waitForNumDocsInAllActiveReplicas(int numDocs, int timeout) throws IOException, SolrServerException, InterruptedException { + DocCollection docCollection = getCollectionState(collectionName); + waitForNumDocsInAllReplicas(numDocs, docCollection.getReplicas().stream().filter(r -> r.getState() == Replica.State.ACTIVE).collect(Collectors.toList()), timeout); + } + + private void waitForNumDocsInAllReplicas(int numDocs, Collection replicas, int timeout) throws IOException, SolrServerException, InterruptedException { + waitForNumDocsInAllReplicas(numDocs, replicas, "*:*", timeout); + } + + private void waitForNumDocsInAllReplicas(int numDocs, Collection replicas, String query, int timeout) throws IOException, SolrServerException, InterruptedException { + TimeOut t = new TimeOut(timeout, TimeUnit.SECONDS); + for (Replica r:replicas) { + if (!r.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())) { + continue; + } + try (HttpSolrClient replicaClient = getHttpSolrClient(r.getCoreUrl())) { + while (true) { + try { + assertEquals("Replica " + r.getName() + " not up to date after " + REPLICATION_TIMEOUT_SECS + " seconds", + numDocs, replicaClient.query(new SolrQuery(query)).getResults().getNumFound()); + break; + } catch (AssertionError e) { + if (t.hasTimedOut()) { + throw e; + } else { + Thread.sleep(100); + } + } + } + } + } + } + + private void waitForDeletion(String collection) throws InterruptedException, KeeperException { + TimeOut t = new TimeOut(10, TimeUnit.SECONDS); + while (cluster.getSolrClient().getZkStateReader().getClusterState().hasCollection(collection)) { + try { + Thread.sleep(100); + if (t.hasTimedOut()) { + fail("Timed out waiting for collection " + collection + " to be deleted."); + } + cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collection); + } catch(SolrException e) { + return; + } + + } + } + + private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int numPassive, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { + if (updateCollection) { + cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collectionName); + } + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); + assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, + docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, + docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of active replicas: " + docCollection, numActive, + docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + return docCollection; + } + + /* + * passes only if all replicas are active or down, and the "liveNodes" reflect the same status + */ + private CollectionStatePredicate clusterStateReflectsActiveAndDownReplicas() { + return (liveNodes, collectionState) -> { + for (Replica r:collectionState.getReplicas()) { + if (r.getState() != Replica.State.DOWN && r.getState() != Replica.State.ACTIVE) { + return false; + } + if (r.getState() == Replica.State.DOWN && liveNodes.contains(r.getNodeName())) { + return false; + } + if (r.getState() == Replica.State.ACTIVE && !liveNodes.contains(r.getNodeName())) { + return false; + } + } + return true; + }; + } + + + private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive, int numPassive) { + return (liveNodes, collectionState) -> { + int writersFound = 0, activesFound = 0, passivesFound = 0; + if (collectionState == null) + return false; + for (Slice slice : collectionState) { + for (Replica replica : slice) { + if (replica.isActive(liveNodes)) + switch (replica.getType()) { + case APPEND: + activesFound++; + break; + case PASSIVE: + passivesFound++; + break; + case REALTIME: + writersFound++; + break; + default: + throw new AssertionError("Unexpected replica type"); + } + } + } + return numWriter == writersFound && numActive == activesFound && numPassive == passivesFound; + }; + } + + private List getSolrCore(boolean isLeader) { + List rs = new ArrayList<>(); + + CloudSolrClient cloudClient = cluster.getSolrClient(); + DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(collectionName); + + for (JettySolrRunner solrRunner : cluster.getJettySolrRunners()) { + if (solrRunner.getCoreContainer() == null) continue; + for (SolrCore solrCore : solrRunner.getCoreContainer().getCores()) { + CloudDescriptor cloudDescriptor = solrCore.getCoreDescriptor().getCloudDescriptor(); + Slice slice = docCollection.getSlice(cloudDescriptor.getShardId()); + Replica replica = docCollection.getReplica(cloudDescriptor.getCoreNodeName()); + if (slice.getLeader().equals(replica) && isLeader) { + rs.add(solrCore); + } else if (!slice.getLeader().equals(replica) && !isLeader) { + rs.add(solrCore); + } + } + } + return rs; + } + + private void checkRTG(int from, int to, List solrRunners) throws Exception{ + for (JettySolrRunner solrRunner: solrRunners) { + try (SolrClient client = solrRunner.newClient()) { + for (int i = from; i <= to; i++) { + SolrQuery query = new SolrQuery(); + query.set("distrib", false); + query.setRequestHandler("/get"); + query.set("id",i); + QueryResponse res = client.query(collectionName, query); + assertNotNull("Can not find doc "+ i + " in " + solrRunner.getBaseUrl(),res.getResponse().get("doc")); + } + } + } + } + + private List getSolrRunner(boolean isLeader) { + List rs = new ArrayList<>(); + CloudSolrClient cloudClient = cluster.getSolrClient(); + DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(collectionName); + for (JettySolrRunner solrRunner : cluster.getJettySolrRunners()) { + if (solrRunner.getCoreContainer() == null) continue; + for (SolrCore solrCore : solrRunner.getCoreContainer().getCores()) { + CloudDescriptor cloudDescriptor = solrCore.getCoreDescriptor().getCloudDescriptor(); + Slice slice = docCollection.getSlice(cloudDescriptor.getShardId()); + Replica replica = docCollection.getReplica(cloudDescriptor.getCoreNodeName()); + if (slice.getLeader() == replica && isLeader) { + rs.add(solrRunner); + } else if (slice.getLeader() != replica && !isLeader) { + rs.add(solrRunner); + } + } + } + return rs; + } + + private void waitForReplicasCatchUp(int numTry) throws IOException, InterruptedException { + String leaderTimeCommit = getSolrCore(true).get(0).getDeletionPolicy().getLatestCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); + if (leaderTimeCommit == null) return; + for (int i = 0; i < numTry; i++) { + boolean inSync = true; + for (SolrCore solrCore : getSolrCore(false)) { + String replicateTimeCommit = solrCore.getDeletionPolicy().getLatestCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); + if (!leaderTimeCommit.equals(replicateTimeCommit)) { + inSync = false; + Thread.sleep(500); + break; + } + } + if (inSync) return; + } + + fail("Some replicas are not in sync with leader"); + + } +} diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java index b592861fb300..965c169718f9 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java @@ -66,8 +66,7 @@ public static void setupCluster() throws Exception { onlyLeaderIndexes = random().nextBoolean(); CollectionAdminRequest - .createCollection(COLLECTION, "config", 2, 2) - .setRealtimeReplicas(onlyLeaderIndexes? 1: -1) + .createCollection(COLLECTION, "config", 2, onlyLeaderIndexes?0:2,onlyLeaderIndexes?2:0,0) .setMaxShardsPerNode(2) .process(cluster.getSolrClient()); AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java b/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java index 8fbfee391e0f..dd55f23c30fd 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java @@ -61,7 +61,6 @@ public TestCollectionAPI() { public void test() throws Exception { try (CloudSolrClient client = createCloudClient(null)) { CollectionAdminRequest.Create req = CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf1",2,2); - req.setRealtimeReplicas(1); req.setMaxShardsPerNode(2); client.request(req); createCollection(null, COLLECTION_NAME1, 1, 1, 1, client, null, "conf1"); @@ -173,7 +172,7 @@ private void clusterStatusWithCollection() throws IOException, SolrServerExcepti Map collection = (Map) collections.get(COLLECTION_NAME); assertNotNull(collection); assertEquals("conf1", collection.get("configName")); - assertEquals("1", collection.get("realtimeReplicas")); +// assertEquals("1", collection.get("realtimeReplicas")); } } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java new file mode 100644 index 000000000000..80cc25e6c5b9 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java @@ -0,0 +1,549 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.CollectionStatePredicate; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.util.TestInjection; +import org.apache.solr.util.TimeOut; +import org.apache.zookeeper.KeeperException; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.carrotsearch.randomizedtesting.annotations.Repeat; + +@Slow +public class TestPassiveReplica extends SolrCloudTestCase { + + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + // TODO: Make sure that FORCELEADER can't be used with Passive + // TODO: Backup/Snapshot should not work on passive replicas + // TODO: ADDSHARD operation + + private String collectionName = null; + private final static int REPLICATION_TIMEOUT_SECS = 10; + + private String suggestedCollectionName() { + return (getTestClass().getSimpleName().replace("Test", "") + "_" + getTestName().split(" ")[0]).replaceAll("(.)(\\p{Upper})", "$1_$2").toLowerCase(Locale.ROOT); + } + + @BeforeClass + public static void setupCluster() throws Exception { + TestInjection.waitForReplicasInSync = null; // We'll be explicit about this in this test + configureCluster(2) // 2 + random().nextInt(3) + .addConfig("conf", configset("cloud-minimal")) + .configure(); + CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false"); + CollectionAdminResponse response = clusterPropRequest.process(cluster.getSolrClient()); + assertEquals(0, response.getStatus()); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + collectionName = suggestedCollectionName(); + expectThrows(SolrException.class, () -> getCollectionState(collectionName)); + } + + @Override + public void tearDown() throws Exception { + for (JettySolrRunner jetty:cluster.getJettySolrRunners()) { + if (!jetty.isRunning()) { + LOG.warn("Jetty {} not running, probably some bad test. Starting it", jetty.getLocalPort()); + ChaosMonkey.start(jetty); + } + } + if (cluster.getSolrClient().getZkStateReader().getClusterState().getCollectionOrNull(collectionName) != null) { + LOG.info("tearDown deleting collection"); + CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); + LOG.info("Collection deleted"); + waitForDeletion(collectionName); + } + super.tearDown(); + } + + // Just to compare test time, nocommit + @Ignore + public void testCreateDelete2() throws Exception { + try { + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 8, 0, 0).process(cluster.getSolrClient()); + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); +// assertEquals("Expecting 4 relpicas per shard", +// 8, docCollection.getReplicas().size()); +// assertEquals("Expecting 6 passive replicas, 3 per shard", +// 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); +// assertEquals("Expecting 2 writer replicas, one per shard", +// 2, docCollection.getReplicas(EnumSet.of(Replica.Type.WRITER)).size()); +// for (Slice s:docCollection.getSlices()) { +// // read-only replicas can never become leaders +// assertFalse(s.getLeader().isReadOnly()); +// } + } finally { + zkClient().printLayoutToStdOut(); + } + } + + @Repeat(iterations=2) // 2 times to make sure cleanup is complete and we can create the same collection + public void testCreateDelete() throws Exception { + try { + CollectionAdminRequest.createCollection(collectionName, "conf", 2, 1, 0, 3) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); + assertEquals("Expecting 4 relpicas per shard", + 8, docCollection.getReplicas().size()); + assertEquals("Expecting 6 passive replicas, 3 per shard", + 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); + assertEquals("Expecting 2 writer replicas, one per shard", + 2, docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); + for (Slice s:docCollection.getSlices()) { + // read-only replicas can never become leaders + assertFalse(s.getLeader().getType() == Replica.Type.PASSIVE); + List shardElectionNodes = cluster.getZkClient().getChildren(ZkStateReader.getShardLeadersElectPath(collectionName, s.getName()), null, true); + assertEquals("Unexpected election nodes for Shard: " + s.getName() + ": " + Arrays.toString(shardElectionNodes.toArray()), + 1, shardElectionNodes.size()); + } + assertUlogPresence(docCollection); + } finally { + zkClient().printLayoutToStdOut(); + } + } + + /** + * Asserts that Update logs exist for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#REALTIME}, but not + * for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} + */ + private void assertUlogPresence(DocCollection collection) { + for (Slice s:collection.getSlices()) { + for (Replica r:s.getReplicas()) { + SolrCore core = null; + try { + core = cluster.getReplicaJetty(r).getCoreContainer().getCore(r.getCoreName()); + assertNotNull(core); + assertEquals("Update log should not exist for replicas of type Passive", r.getType() == Replica.Type.REALTIME, + new java.io.File(core.getUlogDir()).exists()); + } finally { + core.close(); + } + } + } + } + + @SuppressWarnings("unchecked") + public void testAddDocs() throws Exception { + int numReadOnlyReplicas = 1 + random().nextInt(3); + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numReadOnlyReplicas) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + waitForState("Expected collection to be created with 1 shard and " + (numReadOnlyReplicas + 1) + " replicas", collectionName, clusterShape(1, numReadOnlyReplicas + 1)); + DocCollection docCollection = assertNumberOfReplicas(1, 0, numReadOnlyReplicas, false, true); + assertEquals(1, docCollection.getSlices().size()); + + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + + Slice s = docCollection.getSlices().iterator().next(); + try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) { + leaderClient.commit(); // TODO: this shouldn't be necessary here + assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + } + + TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS); + for (Replica r:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE))) { + //TODO: assert replication < REPLICATION_TIMEOUT_SECS + try (HttpSolrClient readOnlyReplicaClient = getHttpSolrClient(r.getCoreUrl())) { + while (true) { + try { + assertEquals("Replica " + r.getName() + " not up to date after 10 seconds", + 1, readOnlyReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + break; + } catch (AssertionError e) { + if (t.hasTimedOut()) { + throw e; + } else { + Thread.sleep(100); + } + } + } + SolrQuery req = new SolrQuery( + "qt", "/admin/plugins", + "stats", "true"); + QueryResponse statsResponse = readOnlyReplicaClient.query(req); +// assertEquals("Replicas shouldn't process the add document request: " + statsResponse, +// 0L, ((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "updateHandler", "stats", "adds")); + assertEquals("Replicas shouldn't process the add document request: " + statsResponse, + 0L, ((Map)((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "/update", "stats")).get("UPDATE./update.requests")); + } + } + assertUlogPresence(docCollection); + } + + public void testAddRemovePassiveReplica() throws Exception { + CollectionAdminRequest.createCollection(collectionName, "conf", 2, 1, 0, 0) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + cluster.getSolrClient().getZkStateReader().registerCore(collectionName); //TODO: Is this needed? + waitForState("Expected collection to be created with 2 shards and 1 replica each", collectionName, clusterShape(2, 1)); + DocCollection docCollection = assertNumberOfReplicas(2, 0, 0, false, true); + assertEquals(2, docCollection.getSlices().size()); + + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.PASSIVE).process(cluster.getSolrClient()); + docCollection = assertNumberOfReplicas(2, 0, 1, true, false); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard2", Replica.Type.PASSIVE).process(cluster.getSolrClient()); + docCollection = assertNumberOfReplicas(2, 0, 2, true, false); + + waitForState("Expecting collection to have 2 shards and 2 replica each", collectionName, clusterShape(2, 2)); + + //Delete passive replica from shard1 + CollectionAdminRequest.deleteReplica( + collectionName, + "shard1", + docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getName()) + .process(cluster.getSolrClient()); + assertNumberOfReplicas(2, 0, 1, true, true); + } + + public void testRemoveAllWriterReplicas() throws Exception { + doTestNoLeader(true); + } + + public void testKillLeader() throws Exception { + doTestNoLeader(false); + } + + @Ignore("Ignore until I figure out a way to reliably record state transitions") + public void testPassiveReplicaStates() throws Exception { + // Validate that passive replicas go through the correct states when starting, stopping, reconnecting + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, 0) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); +// cluster.getSolrClient().getZkStateReader().registerCore(collectionName); //TODO: Is this needed? + waitForState("Replica not added", collectionName, activeReplicaCount(1, 0, 0)); + addDocs(500); + List statesSeen = new ArrayList<>(3); + cluster.getSolrClient().registerCollectionStateWatcher(collectionName, (liveNodes, collectionState) -> { + Replica r = collectionState.getSlice("shard1").getReplica("core_node2"); + LOG.info("CollectionStateWatcher state change: {}", r); + if (r == null) { + return false; + } + statesSeen.add(r.getState()); + LOG.info("CollectionStateWatcher saw state: {}", r.getState()); + return r.getState() == Replica.State.ACTIVE; + }); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.PASSIVE).process(cluster.getSolrClient()); + waitForState("Replica not added", collectionName, activeReplicaCount(1, 0, 1)); + zkClient().printLayoutToStdOut(); + LOG.info("Saw states: " + Arrays.toString(statesSeen.toArray())); + assertEquals("Expecting DOWN->RECOVERING->ACTIVE but saw: " + Arrays.toString(statesSeen.toArray()), 3, statesSeen.size()); + assertEquals("Expecting DOWN->RECOVERING->ACTIVE but saw: " + Arrays.toString(statesSeen.toArray()), Replica.State.DOWN, statesSeen.get(0)); + assertEquals("Expecting DOWN->RECOVERING->ACTIVE but saw: " + Arrays.toString(statesSeen.toArray()), Replica.State.RECOVERING, statesSeen.get(0)); + assertEquals("Expecting DOWN->RECOVERING->ACTIVE but saw: " + Arrays.toString(statesSeen.toArray()), Replica.State.ACTIVE, statesSeen.get(0)); + } + + public void testPassiveReplicaCantConnectToZooKeeper() { + + } + + public void testRealTimeGet() { + // should be redirected to writers + } + + /* + * validate that replication still happens on a new leader + */ + private void doTestNoLeader(boolean removeReplica) throws Exception { + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, 1) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + cluster.getSolrClient().getZkStateReader().registerCore(collectionName); //TODO: Is this needed? + waitForState("Expected collection to be created with 1 shard and 2 replicas", collectionName, clusterShape(1, 2)); + DocCollection docCollection = assertNumberOfReplicas(1, 0, 1, false, true); + + // Add a document and commit + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + Slice s = docCollection.getSlices().iterator().next(); + try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) { + assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + } + + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); + + // Delete leader replica from shard1 + ignoreException("No registered leader was found"); //These are expected + JettySolrRunner leaderJetty = null; + if (removeReplica) { + CollectionAdminRequest.deleteReplica( + collectionName, + "shard1", + s.getLeader().getName()) + .process(cluster.getSolrClient()); + } else { + leaderJetty = cluster.getReplicaJetty(s.getLeader()); + ChaosMonkey.kill(leaderJetty); + waitForState("Leader replica not removed", collectionName, clusterShape(1, 1)); + // Wait for cluster state to be updated + waitForState("Replica state not updated in cluster state", + collectionName, clusterStateReflectsActiveAndDownReplicas()); + } + docCollection = assertNumberOfReplicas(0, 0, 1, true, true); + + // Check that there is no leader for the shard + Replica leader = docCollection.getSlice("shard1").getLeader(); + assertTrue(leader == null || !leader.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())); + + // Passive replica on the other hand should be active + Replica passiveReplica = docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0); + assertTrue(passiveReplica.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())); + + // add document, this should fail since there is no leader. Passive replica should not accept the update + expectThrows(SolrException.class, () -> + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "zoo")) + ); + + // Also fails if I send the update to the passive replica explicitly + try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + expectThrows(SolrException.class, () -> + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "zoo")) + ); + } + + // Queries should still work + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); + // Add realtime replica back. Since there is no rt now, new rt will have no docs. There will be data loss, since the it will become the leader + // and passive replicas will replicate from it. Maybe we want to change this. Replicate from passive replicas is not a good idea, since they + // are by definition out of date. + if (removeReplica) { + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.REALTIME).process(cluster.getSolrClient()); + } else { + ChaosMonkey.start(leaderJetty); + } + waitForState("Expected collection to be 1x2", collectionName, clusterShape(1, 2)); + unIgnoreException("No registered leader was found"); // Should have a leader from now on + + // Validate that the new writer is the leader now + cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collectionName); + docCollection = getCollectionState(collectionName); + leader = docCollection.getSlice("shard1").getLeader(); + assertTrue(leader != null && leader.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())); + + //nocommit: If jetty is restarted, the replication is not forced, and replica doesn't replicate from leader until new docs are added. Is this the correct behavior? Why should these two cases be different? + if (removeReplica) { + // Passive replicas will replicate the empty index if a new replica was added and becomes leader + waitForNumDocsInAllReplicas(0, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); + } + + // add docs agin + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "zoo")); + s = docCollection.getSlices().iterator().next(); + try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) { + leaderClient.commit(); + assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + } + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)), "id:2"); + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); + } + + public void testKillPassiveReplica() throws Exception { + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, 1) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + cluster.getSolrClient().getZkStateReader().registerCore(collectionName); //TODO: Is this needed? + waitForState("Expected collection to be created with 1 shard and 2 replicas", collectionName, clusterShape(1, 2)); + DocCollection docCollection = assertNumberOfReplicas(1, 0, 1, false, true); + assertEquals(1, docCollection.getSlices().size()); + + waitForNumDocsInAllActiveReplicas(0); + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + waitForNumDocsInAllActiveReplicas(1); + + JettySolrRunner passiveReplicaJetty = cluster.getReplicaJetty(docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0)); + ChaosMonkey.kill(passiveReplicaJetty); + waitForState("Replica not removed", collectionName, activeReplicaCount(1, 0, 0)); + // Also wait for the replica to be placed in state="down" + waitForState("Didn't update state", collectionName, clusterStateReflectsActiveAndDownReplicas()); + + cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "bar")); + cluster.getSolrClient().commit(collectionName); + waitForNumDocsInAllActiveReplicas(2); + + ChaosMonkey.start(passiveReplicaJetty); + waitForState("Replica not added", collectionName, activeReplicaCount(1, 0, 1)); + waitForNumDocsInAllActiveReplicas(2); + } + + public void testAddDocsToPassive() { + + } + + public void testSearchWhileReplicationHappens() { + + } + + private void waitForNumDocsInAllActiveReplicas(int numDocs) throws IOException, SolrServerException, InterruptedException { + DocCollection docCollection = getCollectionState(collectionName); + waitForNumDocsInAllReplicas(numDocs, docCollection.getReplicas().stream().filter(r -> r.getState() == Replica.State.ACTIVE).collect(Collectors.toList())); + } + + private void waitForNumDocsInAllReplicas(int numDocs, Collection replicas) throws IOException, SolrServerException, InterruptedException { + waitForNumDocsInAllReplicas(numDocs, replicas, "*:*"); + } + + private void waitForNumDocsInAllReplicas(int numDocs, Collection replicas, String query) throws IOException, SolrServerException, InterruptedException { + TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS); + for (Replica r:replicas) { + try (HttpSolrClient replicaClient = getHttpSolrClient(r.getCoreUrl())) { + while (true) { + try { + assertEquals("Replica " + r.getName() + " not up to date after " + REPLICATION_TIMEOUT_SECS + " seconds", + numDocs, replicaClient.query(new SolrQuery(query)).getResults().getNumFound()); + break; + } catch (AssertionError e) { + if (t.hasTimedOut()) { + throw e; + } else { + Thread.sleep(100); + } + } + } + } + } + } + + private void waitForDeletion(String collection) throws InterruptedException, KeeperException { + TimeOut t = new TimeOut(10, TimeUnit.SECONDS); + while (cluster.getSolrClient().getZkStateReader().getClusterState().hasCollection(collection)) { + LOG.info("Collection not yet deleted"); + try { + Thread.sleep(100); + if (t.hasTimedOut()) { + fail("Timed out waiting for collection " + collection + " to be deleted."); + } + cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collection); + } catch(SolrException e) { + return; + } + + } + } + + private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int numPassive, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { + if (updateCollection) { + cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collectionName); + } + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); + assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, + docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, + docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of active replicas: " + docCollection, numActive, + docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + return docCollection; + } + + /* + * passes only if all replicas are active or down, and the "liveNodes" reflect the same status + */ + private CollectionStatePredicate clusterStateReflectsActiveAndDownReplicas() { + return (liveNodes, collectionState) -> { + for (Replica r:collectionState.getReplicas()) { + if (r.getState() != Replica.State.DOWN && r.getState() != Replica.State.ACTIVE) { + return false; + } + if (r.getState() == Replica.State.DOWN && liveNodes.contains(r.getNodeName())) { + return false; + } + if (r.getState() == Replica.State.ACTIVE && !liveNodes.contains(r.getNodeName())) { + return false; + } + } + return true; + }; + } + + + private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive, int numPassive) { + return (liveNodes, collectionState) -> { + int writersFound = 0, activesFound = 0, passivesFound = 0; + if (collectionState == null) + return false; + for (Slice slice : collectionState) { + for (Replica replica : slice) { + if (replica.isActive(liveNodes)) + switch (replica.getType()) { + case APPEND: + activesFound++; + break; + case PASSIVE: + passivesFound++; + break; + case REALTIME: + writersFound++; + break; + default: + throw new AssertionError("Unexpected replica type"); + } + } + } + return numWriter == writersFound && numActive == activesFound && numPassive == passivesFound; + }; + } + + private void addDocs(int numDocs) throws SolrServerException, IOException { + List docs = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + docs.add(new SolrInputDocument("id", String.valueOf(i), "fieldName_s", String.valueOf(i))); + } + cluster.getSolrClient().add(collectionName, docs); + cluster.getSolrClient().commit(collectionName); + } +} diff --git a/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java index 1bba523c7948..ba92a02323bb 100644 --- a/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java @@ -44,8 +44,8 @@ public static void setupClass() throws Exception { } @Override - protected int getRealtimeReplicas() { - return -1; + protected boolean useAppendReplicas() { + return false; } @AfterClass diff --git a/solr/core/src/test/org/apache/solr/metrics/reporters/solr/SolrCloudReportersTest.java b/solr/core/src/test/org/apache/solr/metrics/reporters/solr/SolrCloudReportersTest.java index df7e6428b201..41848178ab21 100644 --- a/solr/core/src/test/org/apache/solr/metrics/reporters/solr/SolrCloudReportersTest.java +++ b/solr/core/src/test/org/apache/solr/metrics/reporters/solr/SolrCloudReportersTest.java @@ -78,7 +78,7 @@ public void testExplicitConfiguration() throws Exception { String coreName = core.getName(); String collectionName = core.getCoreDescriptor().getCollectionName(); String coreNodeName = core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName(); - String replicaName = coreName.split("_")[3]; + String replicaName = coreName.substring(coreName.indexOf("_replica_") + 1); String shardId = core.getCoreDescriptor().getCloudDescriptor().getShardId(); assertEquals("solr.core." + collectionName + "." + shardId + "." + replicaName, registryName); diff --git a/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java b/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java index f9f377cd9e3d..ebb758d5ede4 100644 --- a/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java +++ b/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java @@ -111,8 +111,8 @@ public static void beforeSuperClass() throws Exception { } @Override - protected int getRealtimeReplicas() { - return onlyLeaderIndexes? 1 : -1; + protected boolean useAppendReplicas() { + return onlyLeaderIndexes; } @After diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index ac388d2c9cd4..0f062f505a1b 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -25,6 +25,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -896,7 +897,7 @@ private Map> buildUrlMap(DocCollection col) { String url = zkProps.getCoreUrl(); urls.add(url); if (!directUpdatesToLeadersOnly) { - for (Replica replica : slice.getReplicas()) { + for (Replica replica : slice.getReplicas(EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME))) { if (!replica.getNodeName().equals(leader.getNodeName()) && !replica.getName().equals(leader.getName())) { ZkCoreNodeProps zkProps1 = new ZkCoreNodeProps(replica); diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java index ec43e11f7cb5..b248f9253b95 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java @@ -16,6 +16,8 @@ */ package org.apache.solr.client.solrj.request; +import static org.apache.solr.common.params.CollectionAdminParams.COUNT_PROP; + import java.io.IOException; import java.util.Collection; import java.util.Map; @@ -34,6 +36,7 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.ImplicitDocRouter; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CollectionParams.CollectionAction; @@ -45,8 +48,6 @@ import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.NamedList; -import static org.apache.solr.common.params.CollectionAdminParams.COUNT_PROP; - /** * This class is experimental and subject to change. * @@ -316,6 +317,19 @@ public SolrParams getParams() { /** Specific Collection API call implementations **/ + /** + * Returns a SolrRequest for creating a collection + * @param collection the collection name + * @param config the collection config + * @param numShards the number of shards in the collection + * @param numRealtimeReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#REALTIME} replicas + * @param numAppendReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#APPEND} replicas + * @param numPassiveReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} replicas + */ + public static Create createCollection(String collection, String config, int numShards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) { + return new Create(collection, config, numShards, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas); + } + /** * Returns a SolrRequest for creating a collection * @param collection the collection name @@ -324,7 +338,7 @@ public SolrParams getParams() { * @param numReplicas the replication factor of the collection */ public static Create createCollection(String collection, String config, int numShards, int numReplicas) { - return new Create(collection, config, numShards, numReplicas); + return new Create(collection, config, numShards, numReplicas, 0, 0); } /** @@ -338,7 +352,7 @@ public static Create createCollection(String collection, String config, int numS * @param numReplicas the replication factor of the collection */ public static Create createCollection(String collection, int numShards, int numReplicas) { - return new Create(collection, numShards, numReplicas); + return new Create(collection, null, numShards, numReplicas, 0, 0); } /** @@ -362,39 +376,36 @@ public static class Create extends AsyncCollectionSpecificAdminRequest { protected String routerField; protected Integer numShards; protected Integer maxShardsPerNode; - protected Integer replicationFactor; + protected Integer realtimeReplicas; + protected Integer passiveReplicas; + protected Integer appendReplicas; private Properties properties; protected Boolean autoAddReplicas; - protected Integer realtimeReplicas; protected Integer stateFormat; private String[] rule , snitch; /** - * @deprecated Use {@link #createCollection(String, String, int, int)} + * @deprecated Use {@link #createCollection(String, String, int, int, int, int)} */ @Deprecated public Create() { super(CollectionAction.CREATE, null); } - - private Create(String collection, String config, int numShards, int numReplicas) { + + private Create(String collection, String config, int numShards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) { // TODO: maybe add other constructors super(CollectionAction.CREATE, SolrIdentifierValidator.validateCollectionName(collection)); this.configName = config; this.numShards = numShards; - this.replicationFactor = numReplicas; + this.realtimeReplicas = numRealtimeReplicas; + this.passiveReplicas = numPassiveReplicas; + this.appendReplicas = numAppendReplicas; } - private Create(String collection, int numShards, int numReplicas) { - super(CollectionAction.CREATE, SolrIdentifierValidator.validateCollectionName(collection)); - this.numShards = numShards; - this.replicationFactor = numReplicas; - } - - private Create(String collection, String config, String shards, int numReplicas) { + private Create(String collection, String config, String shards, int numRealtimeReplicas) { super(CollectionAction.CREATE, SolrIdentifierValidator.validateCollectionName(collection)); this.configName = config; - this.replicationFactor = numReplicas; + this.realtimeReplicas = numRealtimeReplicas; this.shards = shards; this.routerName = ImplicitDocRouter.NAME; } @@ -409,8 +420,10 @@ private Create(String collection, String config, String shards, int numReplicas) public Create setMaxShardsPerNode(Integer numShards) { this.maxShardsPerNode = numShards; return this; } public Create setAutoAddReplicas(boolean autoAddReplicas) { this.autoAddReplicas = autoAddReplicas; return this; } public Create setRealtimeReplicas(Integer realtimeReplicas) { this.realtimeReplicas = realtimeReplicas; return this;} + public Create setAppendReplicas(Integer appendReplicas) { this.appendReplicas = appendReplicas; return this;} + @Deprecated - public Create setReplicationFactor(Integer repl) { this.replicationFactor = repl; return this; } + public Create setReplicationFactor(Integer repl) { this.realtimeReplicas = repl; return this; } public Create setStateFormat(Integer stateFormat) { this.stateFormat = stateFormat; return this; } public Create setRule(String... s){ this.rule = s; return this; } public Create setSnitch(String... s){ this.snitch = s; return this; } @@ -421,9 +434,17 @@ private Create(String collection, String config, String shards, int numReplicas) public String getShards() { return shards; } public Integer getNumShards() { return numShards; } public Integer getMaxShardsPerNode() { return maxShardsPerNode; } - public Integer getReplicationFactor() { return replicationFactor; } + /** + * + * @deprecated Use {@link #getNumRealtimeReplicas()} + */ + @Deprecated + public Integer getReplicationFactor() { return getNumRealtimeReplicas(); } + public Integer getNumRealtimeReplicas() { return realtimeReplicas; } public Boolean getAutoAddReplicas() { return autoAddReplicas; } public Integer getRealtimeReplicas() { return realtimeReplicas; } + public Integer getAppendReplicas() {return appendReplicas;} + public Integer getStateFormat() { return stateFormat; } /** @@ -504,21 +525,25 @@ public SolrParams getParams() { if (routerField != null) { params.set("router.field", routerField); } - if (replicationFactor != null) { - params.set( "replicationFactor", replicationFactor); + if (realtimeReplicas != null) { + params.set( "replicationFactor", realtimeReplicas);// Keep both for compatibility? + params.set( ZkStateReader.REALTIME_REPLICAS, realtimeReplicas); } if (autoAddReplicas != null) { params.set(ZkStateReader.AUTO_ADD_REPLICAS, autoAddReplicas); } - if (realtimeReplicas != null) { - params.set(ZkStateReader.REALTIME_REPLICAS, realtimeReplicas); - } if(properties != null) { addProperties(params, properties); } if (stateFormat != null) { params.set(DocCollection.STATE_FORMAT, stateFormat); } + if (passiveReplicas != null) { + params.set(ZkStateReader.PASSIVE_REPLICAS, passiveReplicas); + } + if (appendReplicas != null) { + params.set(ZkStateReader.APPEND_REPLICAS, appendReplicas); + } if(rule != null) params.set("rule", rule); if(snitch != null) params.set("snitch", snitch); return params; @@ -1615,19 +1640,26 @@ public SolrParams getParams() { } - + /** * Returns a SolrRequest to add a replica to a shard in a collection */ public static AddReplica addReplicaToShard(String collection, String shard) { - return new AddReplica(collection, shard, null); + return addReplicaToShard(collection, shard, Replica.Type.REALTIME); + } + + /** + * Returns a SolrRequest to add a replica of the specified type to a shard in a collection + */ + public static AddReplica addReplicaToShard(String collection, String shard, Replica.Type replicaType) { + return new AddReplica(collection, shard, null, replicaType); } /** * Returns a SolrRequest to add a replica to a collection using a route key */ public static AddReplica addReplicaByRouteKey(String collection, String routeKey) { - return new AddReplica(collection, null, routeKey); + return new AddReplica(collection, null, routeKey, Replica.Type.REALTIME); } // ADDREPLICA request @@ -1640,6 +1672,7 @@ public static class AddReplica extends AsyncCollectionAdminRequest { protected String instanceDir; protected String dataDir; protected Properties properties; + protected Replica.Type type; /** * @deprecated Use {@link #addReplicaByRouteKey(String, String)} or {@link #addReplicaToShard(String, String)} @@ -1649,11 +1682,12 @@ public AddReplica() { super(CollectionAction.ADDREPLICA); } - private AddReplica(String collection, String shard, String routeKey) { + private AddReplica(String collection, String shard, String routeKey, Replica.Type type) { super(CollectionAction.ADDREPLICA); this.collection = collection; this.shard = shard; this.routeKey = routeKey; + this.type = type; } public Properties getProperties() { @@ -1727,6 +1761,11 @@ public AddReplica setAsyncId(String id) { this.asyncId = id; return this; } + + public AddReplica setType(Replica.Type type) { + this.type = type; + return this; + } @Override public SolrParams getParams() { @@ -1752,6 +1791,9 @@ public SolrParams getParams() { if (dataDir != null) { params.add("dataDir", dataDir); } + if (type != null) { + params.add(ZkStateReader.REPLICA_TYPE, type.name()); + } if (properties != null) { addProperties(params, properties); } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java b/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java index d89b2f6418ca..3b409b7f72fa 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java @@ -18,6 +18,7 @@ import java.util.ArrayList; import java.util.Collection; +import java.util.EnumSet; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; @@ -33,7 +34,6 @@ import static org.apache.solr.common.cloud.ZkStateReader.AUTO_ADD_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE; -import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; /** @@ -60,8 +60,6 @@ public class DocCollection extends ZkNodeProps implements Iterable { private final Integer replicationFactor; private final Integer maxShardsPerNode; private final Boolean autoAddReplicas; - private final Integer realtimeReplicas; - public DocCollection(String name, Map slices, Map props, DocRouter router) { this(name, slices, props, router, Integer.MAX_VALUE, ZkStateReader.CLUSTER_STATE); @@ -86,11 +84,7 @@ public DocCollection(String name, Map slices, Map this.maxShardsPerNode = (Integer) verifyProp(props, MAX_SHARDS_PER_NODE); Boolean autoAddReplicas = (Boolean) verifyProp(props, AUTO_ADD_REPLICAS); this.autoAddReplicas = autoAddReplicas == null ? Boolean.FALSE : autoAddReplicas; - Integer realtimeReplicas = (Integer) verifyProp(props, REALTIME_REPLICAS); - this.realtimeReplicas = realtimeReplicas == null ? -1 : realtimeReplicas; - if (this.realtimeReplicas != -1 && this.realtimeReplicas != 1) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid realtimeReplicas must be 1 or -1, found:" + this.realtimeReplicas); - } + verifyProp(props, RULE); verifyProp(props, SNITCH); Iterator> iter = slices.entrySet().iterator(); @@ -133,7 +127,6 @@ public static Object verifyProp(Map props, String propName) { switch (propName) { case MAX_SHARDS_PER_NODE: case REPLICATION_FACTOR: - case REALTIME_REPLICAS: return Integer.parseInt(o.toString()); case AUTO_ADD_REPLICAS: return Boolean.parseBoolean(o.toString()); @@ -234,10 +227,6 @@ public int getMaxShardsPerNode() { return maxShardsPerNode; } - public int getRealtimeReplicas() { - return realtimeReplicas; - } - public String getZNode(){ return znode; } @@ -311,6 +300,14 @@ public List getReplicas() { } return replicas; } + + public List getReplicas(EnumSet s) { + List replicas = new ArrayList<>(); + for (Slice slice : this) { + replicas.addAll(slice.getReplicas(s)); + } + return replicas; + } /** * Get the shardId of a core on a specific node diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java index 4968cf2befd7..8f3ed15b5435 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java @@ -84,9 +84,31 @@ public static State getState(String stateStr) { } } + public enum Type { + /** + * Writes updates to transaction log and indexes locally. Replicas of type {@link #REALTIME} support NRT (soft commits) and RTG. + * Any {@link #REALTIME} replica can become a leader. A shard leader will forward updates to all active {@link #REALTIME} and + * {@link #APPEND} replicas. + */ + REALTIME, + /** + * Writes to transaction log, but not to index, uses replication. Any {@link #APPEND} replica can become leader (by first + * applying all local transaction log elements). If a replica is of type {@link #APPEND} but is also the leader, it will behave + * as a {@link #REALTIME}. A shard leader will forward updates to all active {@link #REALTIME} and {@link #APPEND} replicas. + */ + APPEND, + /** + * Doesn’t index or writes to transaction log. Just replicates from {@link #REALTIME} or {@link #APPEND} replicas. {@link #PASSIVE} + * replicas can’t become shard leaders (i.e., if there are only passive replicas in the collection at some point, updates will fail + * same as if there is no leaders, queries continue to work), so they don’t even participate in elections. + */ + PASSIVE + } + private final String name; private final String nodeName; private final State state; + private final Type type; public Replica(String name, Map propMap) { super(propMap); @@ -98,6 +120,12 @@ public Replica(String name, Map propMap) { this.state = State.ACTIVE; //Default to ACTIVE propMap.put(ZkStateReader.STATE_PROP, state.toString()); } + String typeString = (String)propMap.get(ZkStateReader.REPLICA_TYPE); + if (typeString == null) { + this.type = Type.REALTIME; + } else { + this.type = Type.valueOf(typeString); + } } @@ -129,6 +157,10 @@ public State getState() { public boolean isActive(Set liveNodes) { return liveNodes.contains(this.nodeName) && this.state == State.ACTIVE; } + + public Type getType() { + return this.type; + } @Override public String toString() { diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java index bd3bafdba2c0..2cd716c25775 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java @@ -18,6 +18,7 @@ import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; @@ -27,6 +28,7 @@ import java.util.function.Predicate; import java.util.stream.Collectors; +import org.apache.solr.common.cloud.Replica.Type; import org.noggit.JSONUtil; import org.noggit.JSONWriter; @@ -161,7 +163,7 @@ public Slice(String name, Map replicas, Map props // add the replicas *after* the other properties (for aesthetics, so it's easy to find slice properties in the JSON output) this.replicas = replicas != null ? replicas : makeReplicas((Map)propMap.get(REPLICAS)); propMap.put(REPLICAS, this.replicas); - + Map rules = (Map) propMap.get("routingRules"); if (rules != null) { this.routingRules = new HashMap<>(); @@ -202,7 +204,10 @@ private Map makeReplicas(Map genericReplicas) { private Replica findLeader() { for (Replica replica : replicas.values()) { - if (replica.getStr(LEADER) != null) return replica; + if (replica.getStr(LEADER) != null) { + assert replica.getType() == Type.APPEND || replica.getType() == Type.REALTIME; + return replica; + } } return null; } @@ -215,7 +220,7 @@ public String getName() { } /** - * Gets the list of replicas for this slice. + * Gets the list of all replicas for this slice. */ public Collection getReplicas() { return replicas.values(); @@ -227,6 +232,13 @@ public Collection getReplicas() { public List getReplicas(Predicate pred) { return replicas.values().stream().filter(pred).collect(Collectors.toList()); } + + /** + * Gets the list of replicas that have a state present in s + */ + public List getReplicas(EnumSet s) { + return this.getReplicas(r->s.contains(r.getType())); + } /** * Get the map of coreNodeName to replicas for this slice. @@ -238,7 +250,7 @@ public Map getReplicasMap() { public Map getReplicasCopy() { return new LinkedHashMap<>(replicas); } - + public Replica getLeader() { return leader; } @@ -272,4 +284,5 @@ public String toString() { public void write(JSONWriter jsonWriter) { jsonWriter.write(propMap); } + } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index 7cefbee733e7..af7ab2618cd6 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; @@ -39,6 +40,7 @@ import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; import org.apache.solr.common.Callable; import org.apache.solr.common.SolrException; @@ -92,11 +94,18 @@ public class ZkStateReader implements Closeable { public static final String REJOIN_AT_HEAD_PROP = "rejoinAtHead"; public static final String SOLR_SECURITY_CONF_PATH = "/security.json"; + /** + *@deprecated Use {@link #REALTIME_REPLICAS} + */ + @Deprecated public static final String REPLICATION_FACTOR = "replicationFactor"; public static final String MAX_SHARDS_PER_NODE = "maxShardsPerNode"; public static final String AUTO_ADD_REPLICAS = "autoAddReplicas"; public static final String MAX_CORES_PER_NODE = "maxCoresPerNode"; + //TODO: Move these constants out of ZkStateReader + public static final String PASSIVE_REPLICAS = "passiveReplicas"; public static final String REALTIME_REPLICAS = "realtimeReplicas"; + public static final String APPEND_REPLICAS = "appendReplicas"; public static final String ROLES = "/roles.json"; @@ -106,6 +115,8 @@ public class ZkStateReader implements Closeable { public static final String LEGACY_CLOUD = "legacyCloud"; public static final String URL_SCHEME = "urlScheme"; + + public static final String REPLICA_TYPE = "type"; /** A view of the current state of all collections; combines all the different state sources into a single view. */ @@ -780,6 +791,12 @@ public List getReplicaProps(String collection, String shardId, public List getReplicaProps(String collection, String shardId, String thisCoreNodeName, Replica.State mustMatchStateFilter, Replica.State mustNotMatchStateFilter) { + //nocommit + return getReplicaProps(collection, shardId, thisCoreNodeName, mustMatchStateFilter, null, EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)); + } + + public List getReplicaProps(String collection, String shardId, String thisCoreNodeName, + Replica.State mustMatchStateFilter, Replica.State mustNotMatchStateFilter, final EnumSet acceptReplicaType) { assert thisCoreNodeName != null; ClusterState clusterState = this.clusterState; if (clusterState == null) { @@ -798,7 +815,7 @@ public List getReplicaProps(String collection, String shardId, Map shardMap = replicas.getReplicasMap(); List nodes = new ArrayList<>(shardMap.size()); - for (Entry entry : shardMap.entrySet()) { + for (Entry entry : shardMap.entrySet().stream().filter((e)->acceptReplicaType.contains(e.getValue().getType())).collect(Collectors.toList())) { ZkCoreNodeProps nodeProps = new ZkCoreNodeProps(entry.getValue()); String coreNodeName = entry.getValue().getName(); diff --git a/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java b/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java index f3e0d7e9d0f2..e64892dc6191 100644 --- a/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java +++ b/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java @@ -128,6 +128,11 @@ public abstract class CoreAdminParams */ public static final String NEW_COLLECTION = "newCollection"; + /** + * Tells the CoreAdminHandler that the new Core will be a replica of this type. + */ + public static final String REPLICA_TYPE = "replicaType"; + public enum CoreAdminAction { STATUS(true), UNLOAD, diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java index 50f2d18a15f9..7a89decfc7e6 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java @@ -23,6 +23,8 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.ShardParams; import org.apache.solr.common.params.SolrParams; @@ -74,6 +76,13 @@ public void testAddReplica() { .setRouteKey("route") .setCollectionName("collection"); assertContainsParams(request.getParams(), ACTION, COLLECTION, ShardParams._ROUTE_); + + // with type parameter + request = new CollectionAdminRequest.AddReplica() + .setShardName("shard") + .setCollectionName("collection") + .setType(Replica.Type.REALTIME); + assertContainsParams(request.getParams(), ACTION, COLLECTION, SHARD, ZkStateReader.REPLICA_TYPE); } public void testAddReplicaProp() { diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java index 54ab06d59ea7..c94f24c8a49c 100644 --- a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java +++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java @@ -776,8 +776,8 @@ public void preTearDown() { * is set. */ public static void deleteCore() { - log.info("###deleteCore" ); if (h != null) { + log.info("###deleteCore" ); // If the test case set up Zk, it should still have it as available, // otherwise the core close will just be unnecessarily delayed. CoreContainer cc = h.getCoreContainer(); diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java index 7141eedbc991..e5d261968973 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java @@ -258,6 +258,7 @@ public static void verifyReplicaStatus(ZkStateReader reader, String collection, int maxIterations = 100; Replica.State coreState = null; while(maxIterations-->0) { + System.out.println("ClusterState" + reader.getClusterState()); Slice slice = reader.getClusterState().getSlice(collection, shard); if(slice!=null) { Replica replica = slice.getReplicasMap().get(coreNodeName); @@ -266,6 +267,8 @@ public static void verifyReplicaStatus(ZkStateReader reader, String collection, if(coreState == expectedState) { return; } + } else { + System.out.println(slice.getReplicasMap()); } } Thread.sleep(50); diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java index 48f7670f01a1..e9e0f3d0292d 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java @@ -272,8 +272,8 @@ protected void initCloud() throws Exception { shardToJetty, shardToLeaderJetty); } - protected int getRealtimeReplicas() { - return -1; + protected boolean useAppendReplicas() { + return false; } protected CloudSolrClient createCloudClient(String defaultCollection) { @@ -387,7 +387,7 @@ protected List createJettys(int numJettys) throws Exception { CollectionParams.CollectionAction.CREATE.toLower(), "name", DEFAULT_COLLECTION, "numShards", String.valueOf(sliceCount), DocCollection.STATE_FORMAT, getStateFormat(), - ZkStateReader.REALTIME_REPLICAS, getRealtimeReplicas()))); + ZkStateReader.REALTIME_REPLICAS, useAppendReplicas()))); zkClient.close(); } @@ -1570,11 +1570,24 @@ protected CollectionAdminResponse createCollection(Map> co String shardNames = (String) collectionProps.get(SHARDS_PROP); numShards = StrUtils.splitSmart(shardNames,',').size(); } - Integer replicationFactor = (Integer) collectionProps.get(ZkStateReader.REPLICATION_FACTOR); - if(replicationFactor==null){ - replicationFactor = (Integer) OverseerCollectionMessageHandler.COLL_PROPS.get(ZkStateReader.REPLICATION_FACTOR); + Integer numRealtimeReplicas = (Integer) collectionProps.get(ZkStateReader.REALTIME_REPLICAS); + if (numRealtimeReplicas == null) { + numRealtimeReplicas = (Integer) collectionProps.get(ZkStateReader.REPLICATION_FACTOR); + } + if(numRealtimeReplicas == null){ + numRealtimeReplicas = (Integer) OverseerCollectionMessageHandler.COLL_PROPS.get(ZkStateReader.REPLICATION_FACTOR); + } + if (numRealtimeReplicas == null) { + numRealtimeReplicas = Integer.valueOf(0); + } + Integer numAppendReplicas = (Integer) collectionProps.get(ZkStateReader.APPEND_REPLICAS); + if (numAppendReplicas == null) { + numAppendReplicas = Integer.valueOf(0); + } + Integer numPassiveReplicas = (Integer) collectionProps.get(ZkStateReader.PASSIVE_REPLICAS); + if (numPassiveReplicas == null) { + numPassiveReplicas = Integer.valueOf(0); } - if (confSetName != null) { params.set("collection.configName", confSetName); } @@ -1582,7 +1595,7 @@ protected CollectionAdminResponse createCollection(Map> co int clientIndex = random().nextInt(2); List list = new ArrayList<>(); list.add(numShards); - list.add(replicationFactor); + list.add(numRealtimeReplicas + numAppendReplicas + numPassiveReplicas); if (collectionInfos != null) { collectionInfos.put(collectionName, list); } @@ -1610,26 +1623,30 @@ protected CollectionAdminResponse createCollection(Map> co protected CollectionAdminResponse createCollection(Map> collectionInfos, String collectionName, int numShards, int replicationFactor, int maxShardsPerNode, SolrClient client, String createNodeSetStr) throws SolrServerException, IOException { + int numRealtimeReplicas = useAppendReplicas()?0:replicationFactor; + int numAppendReplicas = useAppendReplicas()?replicationFactor:0; return createCollection(collectionInfos, collectionName, Utils.makeMap( NUM_SLICES, numShards, - ZkStateReader.REPLICATION_FACTOR, replicationFactor, + ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, + ZkStateReader.APPEND_REPLICAS, numAppendReplicas, CREATE_NODE_SET, createNodeSetStr, - ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode, - ZkStateReader.REALTIME_REPLICAS, getRealtimeReplicas()), + ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode), client); } protected CollectionAdminResponse createCollection(Map> collectionInfos, String collectionName, int numShards, int replicationFactor, int maxShardsPerNode, SolrClient client, String createNodeSetStr, String configName) throws SolrServerException, IOException { + int numRealtimeReplicas = useAppendReplicas()?0:replicationFactor; + int numAppendReplicas = useAppendReplicas()?replicationFactor:0; return createCollection(collectionInfos, collectionName, Utils.makeMap( NUM_SLICES, numShards, - ZkStateReader.REPLICATION_FACTOR, replicationFactor, + ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, + ZkStateReader.APPEND_REPLICAS, numAppendReplicas, CREATE_NODE_SET, createNodeSetStr, - ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode, - ZkStateReader.REALTIME_REPLICAS, getRealtimeReplicas()), + ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode), client, configName); } @@ -1808,11 +1825,12 @@ protected void createCollection(String collName, int numShards ) throws Exception { int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrClient() .getZkStateReader().getClusterState().getLiveNodes().size())) + 1; - + int numRealtimeReplicas = useAppendReplicas()?0:replicationFactor; + int numAppendReplicas = useAppendReplicas()?replicationFactor:0; Map props = makeMap( - ZkStateReader.REPLICATION_FACTOR, replicationFactor, ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode, - ZkStateReader.REALTIME_REPLICAS, getRealtimeReplicas(), + ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, + ZkStateReader.APPEND_REPLICAS, numAppendReplicas, NUM_SLICES, numShards); Map> collectionInfos = new HashMap<>(); createCollection(collectionInfos, collName, props, client); diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index 5cae35623d9f..2ad42d164616 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -187,7 +187,7 @@ public void stopJetty(JettySolrRunner jetty) throws Exception { private static void stopJettySolrRunner(JettySolrRunner jetty) throws Exception { assert(jetty != null); - monkeyLog("stop shard! " + jetty.getLocalPort()); + monkeyLog("stop jetty! " + jetty.getLocalPort()); SolrDispatchFilter sdf = jetty.getSolrDispatchFilter(); if (sdf != null) { try { @@ -231,7 +231,7 @@ public static void kill(JettySolrRunner jetty) throws Exception { IpTables.blockPort(jetty.getLocalPort()); - monkeyLog("kill shard! " + jetty.getLocalPort()); + monkeyLog("kill jetty! " + jetty.getLocalPort()); jetty.stop(); From a217dfaaf43950fb229b088745d6207ce5106b6e Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 25 Apr 2017 17:10:41 -0700 Subject: [PATCH 02/41] Added error handling tests for Passive Replicas --- .../TestPassiveReplicaErrorHandling.java | 331 ++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java new file mode 100644 index 000000000000..cbc3f2aeaa39 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.net.URL; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import org.apache.solr.SolrTestCaseJ4.SuppressSSL; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.CollectionStatePredicate; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.util.TestInjection; +import org.apache.solr.util.TimeOut; +import org.apache.zookeeper.KeeperException; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") +public class TestPassiveReplicaErrorHandling extends SolrCloudTestCase { + + private final static int REPLICATION_TIMEOUT_SECS = 10; + + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static Map proxies; + private static Map jettys; + + private String collectionName = null; + + private String suggestedCollectionName() { + return (getTestClass().getSimpleName().replace("Test", "") + "_" + getTestName().split(" ")[0]).replaceAll("(.)(\\p{Upper})", "$1_$2").toLowerCase(Locale.ROOT); + } + + @BeforeClass + public static void setupCluster() throws Exception { + TestInjection.waitForReplicasInSync = null; // We'll be explicit about this in this test + configureCluster(4) + .addConfig("conf", configset("cloud-minimal")) + .configure(); + // Add proxies + proxies = new HashMap<>(cluster.getJettySolrRunners().size()); + jettys = new HashMap<>(cluster.getJettySolrRunners().size()); + for (JettySolrRunner jetty:cluster.getJettySolrRunners()) { + SocketProxy proxy = new SocketProxy(); + jetty.setProxyPort(proxy.getListenPort()); + cluster.stopJettySolrRunner(jetty);//TODO: Can we avoid this restart + cluster.startJettySolrRunner(jetty); + proxy.open(jetty.getBaseUrl().toURI()); + LOG.info("Adding proxy for URL: " + jetty.getBaseUrl() + ". Proxy: " + proxy.getUrl()); + proxies.put(proxy.getUrl(), proxy); + jettys.put(proxy.getUrl(), jetty); + } + TimeOut t = new TimeOut(10, TimeUnit.SECONDS); + while (true) { + try { + CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false"); + CollectionAdminResponse response = clusterPropRequest.process(cluster.getSolrClient()); + assertEquals(0, response.getStatus()); + break; + } catch (SolrServerException e) { + Thread.sleep(50); + if (t.hasTimedOut()) { + throw e; + } + } + } + } + + @AfterClass + public static void tearDownCluster() throws Exception { +// cluster.shutdown(); +// cluster = null; + for (SocketProxy proxy:proxies.values()) { + proxy.close(); + } + proxies = null; + jettys = null; + } + + @Override + public void setUp() throws Exception { + super.setUp(); + collectionName = suggestedCollectionName(); + expectThrows(SolrException.class, () -> getCollectionState(collectionName)); + cluster.getSolrClient().setDefaultCollection(collectionName); + } + + @Override + public void tearDown() throws Exception { + if (cluster.getSolrClient().getZkStateReader().getClusterState().getCollectionOrNull(collectionName) != null) { + LOG.info("tearDown deleting collection"); + CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); + LOG.info("Collection deleted"); + waitForDeletion(collectionName); + } + collectionName = null; + super.tearDown(); + } + +// @Repeat(iterations=10) + public void testCantConnectToPassiveReplica() throws Exception { + int numShards = 2; + CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1, 0, 1) + .setMaxShardsPerNode(1) + .process(cluster.getSolrClient()); + addDocs(10); + DocCollection docCollection = assertNumberOfReplicas(numShards, 0, numShards, false, true); + Slice s = docCollection.getSlices().iterator().next(); + SocketProxy proxy = getProxyForReplica(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0)); + try { + proxy.close(); + for (int i = 1; i <= 10; i ++) { + addDocs(10 + i); + try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) { + assertNumDocs(10 + i, leaderClient); + } + } + try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + passiveReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound(); + fail("Shouldn't be able to query the passive replica"); + } catch (SolrServerException e) { + //expected + } + assertNumberOfReplicas(numShards, 0, numShards, true, true);// Replica should still be active, since it doesn't disconnect from ZooKeeper + { + long numFound = 0; + TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS); + while (numFound < 20 && !t.hasTimedOut()) { + Thread.sleep(200); + numFound = cluster.getSolrClient().query(collectionName, new SolrQuery("*:*")).getResults().getNumFound(); + } + } + } finally { + proxy.reopen(); + } + + try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + assertNumDocs(20, passiveReplicaClient); + } + } + + public void testCantConnectToLeader() throws Exception { + int numShards = 1; + CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1, 0, 1) + .setMaxShardsPerNode(1) + .process(cluster.getSolrClient()); + addDocs(10); + DocCollection docCollection = assertNumberOfReplicas(numShards, 0, numShards, false, true); + Slice s = docCollection.getSlices().iterator().next(); + SocketProxy proxy = getProxyForReplica(s.getLeader()); + try { + // wait for replication + try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + assertNumDocs(10, passiveReplicaClient); + } + proxy.close(); + expectThrows(SolrException.class, ()->addDocs(1)); + try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + assertNumDocs(10, passiveReplicaClient); + } + assertNumDocs(10, cluster.getSolrClient()); + } finally { + proxy.reopen(); + } + } + + public void testPassiveReplicaDisconnectsFromZooKeeper() throws Exception { + int numShards = 1; + CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1, 0, 1) + .setMaxShardsPerNode(1) + .process(cluster.getSolrClient()); + addDocs(10); + DocCollection docCollection = assertNumberOfReplicas(numShards, 0, numShards, false, true); + Slice s = docCollection.getSlices().iterator().next(); + try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + assertNumDocs(10, passiveReplicaClient); + } + addDocs(20); + JettySolrRunner jetty = getJettyForReplica(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0)); + cluster.expireZkSession(jetty); + addDocs(30); + waitForState("Expecting node to be disconnected", collectionName, activeReplicaCount(1, 0, 0)); + addDocs(40); + waitForState("Expecting node to be disconnected", collectionName, activeReplicaCount(1, 0, 1)); + try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + assertNumDocs(40, passiveReplicaClient); + } + } + + + private void assertNumDocs(int numDocs, SolrClient client) throws InterruptedException, SolrServerException, IOException { + TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS); + long numFound = -1; + while (!t.hasTimedOut()) { + Thread.sleep(200); + numFound = client.query(new SolrQuery("*:*")).getResults().getNumFound(); + if (numFound == numDocs) { + return; + } + } + fail("Didn't get expected doc count. Expected: " + numDocs + ", Found: " + numFound); + } + + private void addDocs(int numDocs) throws SolrServerException, IOException { + List docs = new ArrayList<>(numDocs); + for (int i = 0; i < numDocs; i++) { + docs.add(new SolrInputDocument("id", String.valueOf(i), "fieldName_s", String.valueOf(i))); + } + cluster.getSolrClient().add(collectionName, docs); + cluster.getSolrClient().commit(collectionName); + } + + private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int numPassive, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { + if (updateCollection) { + cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collectionName); + } + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); + assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, + docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, + docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of active replicas: " + docCollection, numActive, + docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + return docCollection; + } + + protected JettySolrRunner getJettyForReplica(Replica replica) throws Exception { + String replicaBaseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP); + assertNotNull(replicaBaseUrl); + URL baseUrl = new URL(replicaBaseUrl); + + JettySolrRunner proxy = jettys.get(baseUrl.toURI()); + assertNotNull("No proxy found for " + baseUrl + "!", proxy); + return proxy; + } + + protected SocketProxy getProxyForReplica(Replica replica) throws Exception { + String replicaBaseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP); + assertNotNull(replicaBaseUrl); + URL baseUrl = new URL(replicaBaseUrl); + + SocketProxy proxy = proxies.get(baseUrl.toURI()); + if (proxy == null && !baseUrl.toExternalForm().endsWith("/")) { + baseUrl = new URL(baseUrl.toExternalForm() + "/"); + proxy = proxies.get(baseUrl.toURI()); + } + assertNotNull("No proxy found for " + baseUrl + "!", proxy); + return proxy; + } + + private void waitForDeletion(String collection) throws InterruptedException, KeeperException { + TimeOut t = new TimeOut(10, TimeUnit.SECONDS); + while (cluster.getSolrClient().getZkStateReader().getClusterState().hasCollection(collection)) { + LOG.info("Collection not yet deleted"); + try { + Thread.sleep(100); + if (t.hasTimedOut()) { + fail("Timed out waiting for collection " + collection + " to be deleted."); + } + cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collection); + } catch(SolrException e) { + return; + } + + } + } + + private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive, int numPassive) { + return (liveNodes, collectionState) -> { + int writersFound = 0, activesFound = 0, passivesFound = 0; + if (collectionState == null) + return false; + for (Slice slice : collectionState) { + for (Replica replica : slice) { + if (replica.isActive(liveNodes)) + switch (replica.getType()) { + case APPEND: + activesFound++; + break; + case PASSIVE: + passivesFound++; + break; + case REALTIME: + writersFound++; + break; + default: + throw new AssertionError("Unexpected replica type"); + } + } + } + return numWriter == writersFound && numActive == activesFound && numPassive == passivesFound; + }; + } + +} From 0330b4abe5785e509b29d3bc7f461c4e57d153f7 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 26 Apr 2017 16:21:40 -0700 Subject: [PATCH 03/41] Sometimes use legacyCloud in tests --- .../core/src/java/org/apache/solr/cloud/ZkController.java | 1 + .../src/test/org/apache/solr/cloud/TestAppendReplica.java | 4 +++- .../test/org/apache/solr/cloud/TestPassiveReplica.java | 8 +++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 55dd9a0ad81e..b239dd2445dc 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -1230,6 +1230,7 @@ public void publish(final CoreDescriptor cd, final Replica.State state, boolean props.put(ZkStateReader.NODE_NAME_PROP, getNodeName()); props.put(ZkStateReader.SHARD_ID_PROP, cd.getCloudDescriptor().getShardId()); props.put(ZkStateReader.COLLECTION_PROP, collection); + props.put(ZkStateReader.REPLICA_TYPE, cd.getCloudDescriptor().getReplicaType().toString()); if (numShards != null) { props.put(ZkStateReader.NUM_SHARDS_PROP, numShards.toString()); } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java index fe353d25bdd7..a7f30427a793 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java @@ -82,7 +82,9 @@ public static void setupCluster() throws Exception { configureCluster(2) // 2 + random().nextInt(3) .addConfig("conf", configset("cloud-minimal-inplace-updates")) .configure(); - CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false"); + Boolean useLegacyCloud = rarely(); + LOG.info("Using legacyCloud?: {}", useLegacyCloud); + CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, String.valueOf(useLegacyCloud)); CollectionAdminResponse response = clusterPropRequest.process(cluster.getSolrClient()); assertEquals(0, response.getStatus()); } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java index 80cc25e6c5b9..e158a92ee1cb 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java @@ -77,7 +77,9 @@ public static void setupCluster() throws Exception { configureCluster(2) // 2 + random().nextInt(3) .addConfig("conf", configset("cloud-minimal")) .configure(); - CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false"); + Boolean useLegacyCloud = rarely(); + LOG.info("Using legacyCloud?: {}", useLegacyCloud); + CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, String.valueOf(useLegacyCloud)); CollectionAdminResponse response = clusterPropRequest.process(cluster.getSolrClient()); assertEquals(0, response.getStatus()); } @@ -287,10 +289,6 @@ public void testPassiveReplicaStates() throws Exception { assertEquals("Expecting DOWN->RECOVERING->ACTIVE but saw: " + Arrays.toString(statesSeen.toArray()), Replica.State.ACTIVE, statesSeen.get(0)); } - public void testPassiveReplicaCantConnectToZooKeeper() { - - } - public void testRealTimeGet() { // should be redirected to writers } From 304add6f631494d28d952431055e89b8357c6a5a Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 26 Apr 2017 16:28:25 -0700 Subject: [PATCH 04/41] Added ChaosMonkey tests with safe leader for passive replicas --- .../java/org/apache/solr/cloud/Assign.java | 1 + .../apache/solr/cloud/CloudDescriptor.java | 3 +- .../solr/cloud/CreateCollectionCmd.java | 2 - solr/core/src/test-files/log4j.properties | 2 + ...nkeySafeLeaderWithPassiveReplicasTest.java | 229 ++++++++++++++++++ .../TestPassiveReplicaErrorHandling.java | 2 - .../solr/BaseDistributedSearchTestCase.java | 1 + .../cloud/AbstractFullDistribZkTestBase.java | 63 +++-- 8 files changed, 284 insertions(+), 19 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java diff --git a/solr/core/src/java/org/apache/solr/cloud/Assign.java b/solr/core/src/java/org/apache/solr/cloud/Assign.java index ca784e554d4d..924ff171bffd 100644 --- a/solr/core/src/java/org/apache/solr/cloud/Assign.java +++ b/solr/core/src/java/org/apache/solr/cloud/Assign.java @@ -109,6 +109,7 @@ public static String assignShard(DocCollection collection, Integer numShards) { } public static String buildCoreName(String collectionName, String shard, Replica.Type type, int replicaNum) { + // TODO: Adding the suffix is great for debugging, but may be an issue if at some point we want to support a way to change replica type return collectionName + "_" + shard + "_replica_" + type.name().substring(0,1).toLowerCase() + replicaNum; } diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java index ff29afc60b50..c39272c42f61 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java +++ b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java @@ -64,7 +64,8 @@ public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { if (Strings.isNullOrEmpty(nodeName)) this.nodeName = null; this.numShards = PropertiesUtil.toInteger(props.getProperty(CloudDescriptor.NUM_SHARDS), null); - this.replicaType = Replica.Type.valueOf(props.getProperty(CloudDescriptor.REPLICA_TYPE, Replica.Type.REALTIME.name())); + System.out.println("ReplicaType: " + props); + this.replicaType = Replica.Type.valueOf(props.getProperty(CloudDescriptor.REPLICA_TYPE, Replica.Type.REALTIME.toString())); for (String propName : props.stringPropertyNames()) { if (propName.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) { collectionParams.put(propName.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), props.getProperty(propName)); diff --git a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java index 41b842057631..4b6971ee4659 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java @@ -209,8 +209,6 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul for (Map.Entry e : positionVsNodes.entrySet()) { ReplicaAssigner.Position position = e.getKey(); String nodeName = e.getValue(); - // TODO: Adding the suffix is great for debugging, but may be an issue if at some point we want to support a way to change replica type -// String coreName = collectionName + "_" + position.shard + "_replica" + position.suffix + (position.index + 1); String coreName = Assign.buildCoreName(collectionName, position.shard, position.type, position.index + 1); log.debug(formatString("Creating core {0} as part of shard {1} of collection {2} on {3}" , coreName, position.shard, collectionName, nodeName)); diff --git a/solr/core/src/test-files/log4j.properties b/solr/core/src/test-files/log4j.properties index c464a9fd9def..73f75532ec3e 100644 --- a/solr/core/src/test-files/log4j.properties +++ b/solr/core/src/test-files/log4j.properties @@ -1,4 +1,5 @@ # Logging level +# nocommit: revert this file before back to master log4j.rootLogger=DEBUG, CONSOLE log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender @@ -16,6 +17,7 @@ log4j.logger.org.apache.solr.cloud.OverseerTaskQueue=INFO log4j.logger.org.apache.solr.common.cloud.SolrZkClient=INFO log4j.logger.org.apache.solr.util.stats.InstrumentedPoolingHttpClientConnectionManager=INFO log4j.logger.com.codehale.metrics=INFO +log4j.logger.com.codahale.metrics.JmxReporter=INFO #log4j.logger.org.apache.solr.update.processor.LogUpdateProcessorFactory=DEBUG #log4j.logger.org.apache.solr.update.processor.DistributedUpdateProcessor=DEBUG #log4j.logger.org.apache.solr.update.PeerSync=DEBUG diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java new file mode 100644 index 000000000000..5fee6049f8b7 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Slow +public class ChaosMonkeySafeLeaderWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static final Integer RUN_LENGTH = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.runlength", "-1")); + + private final int numPassiveReplicas; + private final int numRealtimeOrAppendReplicas; + + protected int getPassiveReplicaCount() { + return numPassiveReplicas; + } + + @BeforeClass + public static void beforeSuperClass() { + schemaString = "schema15.xml"; // we need a string id + System.setProperty("solr.autoCommit.maxTime", "15000"); + setErrorHook(); + } + + @AfterClass + public static void afterSuperClass() { + System.clearProperty("solr.autoCommit.maxTime"); + clearErrorHook(); + } + + protected static final String[] fieldNames = new String[]{"f_i", "f_f", "f_d", "f_l", "f_dt"}; + protected static final RandVal[] randVals = new RandVal[]{rint, rfloat, rdouble, rlong, rdate}; + + public String[] getFieldNames() { + return fieldNames; + } + + public RandVal[] getRandValues() { + return randVals; + } + + @Override + public void distribSetUp() throws Exception { + useFactory("solr.StandardDirectoryFactory"); + super.distribSetUp(); + } + + public ChaosMonkeySafeLeaderWithPassiveReplicasTest() { + super(); + numPassiveReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; + numRealtimeOrAppendReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; + sliceCount = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.slicecount", "-1")); + if (sliceCount == -1) { + sliceCount = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1; + } + + int numNodes = sliceCount * (numRealtimeOrAppendReplicas + numPassiveReplicas); + fixShardCount(numNodes); + log.info("Starting ChaosMonkey test with {} shards and {} nodes", sliceCount, numNodes); + } + + @Test + public void test() throws Exception { + DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(DEFAULT_COLLECTION); + assertEquals(this.sliceCount, docCollection.getSlices().size()); + Slice s = docCollection.getSlice("shard1"); + assertNotNull(s); + assertEquals("Unexpected number of replicas. Collection: " + docCollection, numRealtimeOrAppendReplicas + numPassiveReplicas, s.getReplicas().size()); + assertEquals("Unexpected number of passive replicas. Collection: " + docCollection, numPassiveReplicas, s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); + assertEquals(useAppendReplicas()?0:numRealtimeOrAppendReplicas, s.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); + assertEquals(useAppendReplicas()?numRealtimeOrAppendReplicas:0, s.getReplicas(EnumSet.of(Replica.Type.APPEND)).size()); + handle.clear(); + handle.put("timestamp", SKIPVAL); + + // randomly turn on 1 seconds 'soft' commit + randomlyEnableAutoSoftCommit(); + + tryDelete(); + + List threads = new ArrayList<>(); + int threadCount = 2; + int batchSize = 1; + if (random().nextBoolean()) { + batchSize = random().nextInt(98) + 2; + } + + boolean pauseBetweenUpdates = TEST_NIGHTLY ? random().nextBoolean() : true; + int maxUpdates = -1; + if (!pauseBetweenUpdates) { + maxUpdates = 1000 + random().nextInt(1000); + } else { + maxUpdates = 15000; + } + + for (int i = 0; i < threadCount; i++) { + StoppableIndexingThread indexThread = new StoppableIndexingThread(controlClient, cloudClient, Integer.toString(i), true, maxUpdates, batchSize, pauseBetweenUpdates); // random().nextInt(999) + 1 + threads.add(indexThread); + indexThread.start(); + } + + chaosMonkey.startTheMonkey(false, 500); + try { + long runLength; + if (RUN_LENGTH != -1) { + runLength = RUN_LENGTH; + } else { + int[] runTimes; + if (TEST_NIGHTLY) { + runTimes = new int[] {5000, 6000, 10000, 15000, 25000, 30000, + 30000, 45000, 90000, 120000}; + } else { + runTimes = new int[] {5000, 7000, 15000}; + } + runLength = runTimes[random().nextInt(runTimes.length - 1)]; + } + + Thread.sleep(runLength); + } finally { + chaosMonkey.stopTheMonkey(); + } + + for (StoppableIndexingThread indexThread : threads) { + indexThread.safeStop(); + } + + // wait for stop... + for (StoppableIndexingThread thread : threads) { + thread.join(); + } + + for (StoppableIndexingThread indexThread : threads) { + assertEquals(0, indexThread.getFailCount()); + } + + // try and wait for any replications and what not to finish... + + Thread.sleep(2000); + + waitForThingsToLevelOut(180000); + + // even if things were leveled out, a jetty may have just been stopped or something + // we wait again and wait to level out again to make sure the system is not still in flux + + Thread.sleep(3000); + + waitForThingsToLevelOut(180000); + + checkShardConsistency(batchSize == 1, true); + + log.info("control docs:" + controlClient.query(new SolrQuery("*:*")).getResults().getNumFound() + "\n\n"); + + // try and make a collection to make sure the overseer has survived the expiration and session loss + + // sometimes we restart zookeeper as well + if (random().nextBoolean()) { + zkServer.shutdown(); + zkServer = new ZkTestServer(zkServer.getZkDir(), zkServer.getPort()); + zkServer.run(); + } + + try (CloudSolrClient client = createCloudClient("collection1")) { + createCollection(null, "testcollection", 1, 1, 100, client, null, "conf1"); + + } + List numShardsNumReplicas = new ArrayList<>(2); + numShardsNumReplicas.add(1); + numShardsNumReplicas.add(1 + getPassiveReplicaCount()); + checkForCollection("testcollection",numShardsNumReplicas, null); + } + + private void tryDelete() throws Exception { + long start = System.nanoTime(); + long timeout = start + TimeUnit.NANOSECONDS.convert(10, TimeUnit.SECONDS); + while (System.nanoTime() < timeout) { + try { + del("*:*"); + break; + } catch (SolrServerException e) { + // cluster may not be up yet + e.printStackTrace(); + } + Thread.sleep(100); + } + } + + // skip the randoms - they can deadlock... + @Override + protected void indexr(Object... fields) throws Exception { + SolrInputDocument doc = new SolrInputDocument(); + addFields(doc, fields); + addFields(doc, "rnd_b", true); + indexDoc(doc); + } + +} diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java index cbc3f2aeaa39..7dd147a08fd4 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java @@ -103,8 +103,6 @@ public static void setupCluster() throws Exception { @AfterClass public static void tearDownCluster() throws Exception { -// cluster.shutdown(); -// cluster = null; for (SocketProxy proxy:proxies.values()) { proxy.close(); } diff --git a/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java b/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java index 8c6eb6093dab..9213fee1625c 100644 --- a/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java +++ b/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java @@ -1123,6 +1123,7 @@ protected void setupJettySolrHome(File jettyHome) throws IOException { coreProperties.setProperty("config", "${solrconfig:solrconfig.xml}"); coreProperties.setProperty("schema", "${schema:schema.xml}"); coreProperties.setProperty("coreNodeName", "${coreNodeName:}"); + coreProperties.setProperty("replicaType", "${replicaType:}"); writeCoreProperties(jettyHome.toPath().resolve("cores").resolve("collection1"), coreProperties, "collection1"); diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java index e9e0f3d0292d..acd425b2026f 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java @@ -379,17 +379,22 @@ protected List createJettys(int numJettys) throws Exception { StringBuilder sb = new StringBuilder(); if ("2".equals(getStateFormat())) { - log.info("Creating collection1 with stateFormat=2"); + log.info("Creating " + DEFAULT_COLLECTION + " with stateFormat=2"); SolrZkClient zkClient = new SolrZkClient(zkServer.getZkAddress(), AbstractZkTestCase.TIMEOUT, AbstractZkTestCase.TIMEOUT); Overseer.getStateUpdateQueue(zkClient).offer( Utils.toJSON(Utils.makeMap(Overseer.QUEUE_OPERATION, - CollectionParams.CollectionAction.CREATE.toLower(), "name", - DEFAULT_COLLECTION, "numShards", String.valueOf(sliceCount), + CollectionParams.CollectionAction.CREATE.toLower(), + "name", DEFAULT_COLLECTION, + "numShards", String.valueOf(sliceCount), DocCollection.STATE_FORMAT, getStateFormat(), - ZkStateReader.REALTIME_REPLICAS, useAppendReplicas()))); + ZkStateReader.REALTIME_REPLICAS, useAppendReplicas()?"0":"1", + ZkStateReader.APPEND_REPLICAS, useAppendReplicas()?"1":"0", + ZkStateReader.PASSIVE_REPLICAS, String.valueOf(getPassiveReplicaCount())))); zkClient.close(); } + + int numPassiveReplicas = getPassiveReplicaCount() * sliceCount; for (int i = 1; i <= numJettys; i++) { if (sb.length() > 0) sb.append(','); @@ -399,9 +404,22 @@ DocCollection.STATE_FORMAT, getStateFormat(), jettyDir.mkdirs(); setupJettySolrHome(jettyDir); - log.info("create jetty {} in directory {}", i, jettyDir); - JettySolrRunner j = createJetty(jettyDir, useJettyDataDir ? getDataDir(testDir + "/jetty" - + cnt) : null, null, "solrconfig.xml", null); + JettySolrRunner j; + + if (numPassiveReplicas > 0) { + numPassiveReplicas--; + log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.PASSIVE); + j = createJetty(jettyDir, useJettyDataDir ? getDataDir(testDir + "/jetty" + + cnt) : null, null, "solrconfig.xml", null, Replica.Type.PASSIVE); + } else if (useAppendReplicas()) { + log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.APPEND); + j = createJetty(jettyDir, useJettyDataDir ? getDataDir(testDir + "/jetty" + + cnt) : null, null, "solrconfig.xml", null, Replica.Type.APPEND); + } else { + log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.REALTIME); + j = createJetty(jettyDir, useJettyDataDir ? getDataDir(testDir + "/jetty" + + cnt) : null, null, "solrconfig.xml", null, null); + } jettys.add(j); SolrClient client = createNewSolrClient(j.getLocalPort()); clients.add(client); @@ -410,17 +428,18 @@ DocCollection.STATE_FORMAT, getStateFormat(), this.jettys.addAll(jettys); this.clients.addAll(clients); - int numShards = getTotalReplicas(DEFAULT_COLLECTION); + int numReplicas = getTotalReplicas(DEFAULT_COLLECTION); + int expectedNumReplicas = numJettys; // now wait until we see that the number of shards in the cluster state // matches what we expect int retries = 0; - while (numShards != getShardCount()) { - numShards = getTotalReplicas(DEFAULT_COLLECTION); - if (numShards == getShardCount()) break; + while (numReplicas != expectedNumReplicas) { + numReplicas = getTotalReplicas(DEFAULT_COLLECTION); + if (numReplicas == expectedNumReplicas) break; if (retries++ == 60) { printLayoutOnTearDown = true; - fail("Shards in the state does not match what we set:" + numShards + " vs " + getShardCount()); + fail("Number of replicas in the state does not match what we set:" + numReplicas + " vs " + expectedNumReplicas); } Thread.sleep(500); } @@ -431,7 +450,7 @@ DocCollection.STATE_FORMAT, getStateFormat(), zkStateReader.getLeaderRetry(DEFAULT_COLLECTION, "shard" + i, 10000); } - if (numShards > 0) { + if (numReplicas > 0) { updateMappingsFromZk(this.jettys, this.clients); } @@ -449,6 +468,10 @@ DocCollection.STATE_FORMAT, getStateFormat(), } + protected int getPassiveReplicaCount() { + return 0; + } + /* Total number of replicas (number of cores serving an index to the collection) shown by the cluster state */ protected int getTotalReplicas(String collection) { ZkStateReader zkStateReader = cloudClient.getZkStateReader(); @@ -484,8 +507,12 @@ public JettySolrRunner createJetty(String dataDir, String ulogDir, String shardL return jetty; } - + public JettySolrRunner createJetty(File solrHome, String dataDir, String shardList, String solrConfigOverride, String schemaOverride) throws Exception { + return createJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride, null); + } + + public JettySolrRunner createJetty(File solrHome, String dataDir, String shardList, String solrConfigOverride, String schemaOverride, Replica.Type replicaType) throws Exception { // randomly test a relative solr.home path if (random().nextBoolean()) { solrHome = getRelativeSolrHomePath(solrHome); @@ -508,6 +535,11 @@ public JettySolrRunner createJetty(File solrHome, String dataDir, String shardLi props.setProperty("shards", shardList); if (dataDir != null) props.setProperty("solr.data.dir", getDataDir(dataDir)); + if (replicaType != null) { + props.setProperty("replicaType", replicaType.toString()); + } else { // TODO: include the case with no replicaTYpe defined: if (random().nextBoolean()) { + props.setProperty("replicaType", Replica.Type.REALTIME.toString()); + } props.setProperty("coreRootDirectory", solrHome.toPath().resolve("cores").toAbsolutePath().toString()); JettySolrRunner jetty = new JettySolrRunner(solrHome.getPath(), props, jettyconfig); @@ -1630,6 +1662,7 @@ protected CollectionAdminResponse createCollection(Map> col NUM_SLICES, numShards, ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, ZkStateReader.APPEND_REPLICAS, numAppendReplicas, + ZkStateReader.PASSIVE_REPLICAS, getPassiveReplicaCount(), CREATE_NODE_SET, createNodeSetStr, ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode), client); @@ -1645,6 +1678,7 @@ protected CollectionAdminResponse createCollection(Map> co NUM_SLICES, numShards, ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, ZkStateReader.APPEND_REPLICAS, numAppendReplicas, + ZkStateReader.PASSIVE_REPLICAS, getPassiveReplicaCount(), CREATE_NODE_SET, createNodeSetStr, ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode), client, configName); @@ -1831,6 +1865,7 @@ protected void createCollection(String collName, ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode, ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, ZkStateReader.APPEND_REPLICAS, numAppendReplicas, + ZkStateReader.PASSIVE_REPLICAS, getPassiveReplicaCount(), NUM_SLICES, numShards); Map> collectionInfos = new HashMap<>(); createCollection(collectionInfos, collName, props, client); From 2c133d4cfb533900dcb72784c12b3829e8277c65 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Thu, 27 Apr 2017 16:27:46 -0700 Subject: [PATCH 05/41] Added ChaosMonkey test without safe leader for passive replicas --- .../processor/DistributedUpdateProcessor.java | 5 + ...yNothingIsSafeWithPassiveReplicasTest.java | 431 ++++++++++++++++++ ...nkeySafeLeaderWithPassiveReplicasTest.java | 8 + .../org/apache/solr/cloud/ChaosMonkey.java | 85 +++- 4 files changed, 517 insertions(+), 12 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java index e9f63d507024..41fde185dee8 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java @@ -1885,6 +1885,11 @@ public void processCommit(CommitUpdateCommand cmd) throws IOException { nodes = getCollectionUrls(req, req.getCore().getCoreDescriptor() .getCloudDescriptor().getCollectionName(), EnumSet.of(Replica.Type.APPEND,Replica.Type.REALTIME)); + if (nodes == null) { + // This could happen if there are only passive replicas + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "Unable to distribute commit operation. No replicas available of types " + Replica.Type.APPEND + " or " + Replica.Type.REALTIME); + } if (isLeader && nodes.size() == 1) { singleLeader = true; } diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java new file mode 100644 index 000000000000..9daec5b8f38c --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -0,0 +1,431 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.lang.invoke.MethodHandles; +import java.net.ConnectException; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.http.client.HttpClient; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker; +import org.apache.solr.SolrTestCaseJ4.SuppressSSL; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; +import org.apache.solr.client.solrj.impl.HttpClientUtil; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.util.IOUtils; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Slow +@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") +//@ThreadLeakLingering(linger = 60000) +@SuppressObjectReleaseTracker(bugUrl="Testing purposes") +public class ChaosMonkeyNothingIsSafeWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { + private static final int FAIL_TOLERANCE = 100; + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static final Integer RUN_LENGTH = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.runlength", "-1")); + + private final boolean useAppendReplicas = random().nextBoolean(); + + private final int numPassiveReplicas; + private final int numRealtimeOrAppendReplicas; + + protected int getPassiveReplicaCount() { + return numPassiveReplicas; + } + + @BeforeClass + public static void beforeSuperClass() { + schemaString = "schema15.xml"; // we need a string id + System.setProperty("solr.autoCommit.maxTime", "15000"); + setErrorHook(); + } + + @AfterClass + public static void afterSuperClass() { + System.clearProperty("solr.autoCommit.maxTime"); + clearErrorHook(); + } + + protected static final String[] fieldNames = new String[]{"f_i", "f_f", "f_d", "f_l", "f_dt"}; + protected static final RandVal[] randVals = new RandVal[]{rint, rfloat, rdouble, rlong, rdate}; + + private int clientSoTimeout; + + public String[] getFieldNames() { + return fieldNames; + } + + public RandVal[] getRandValues() { + return randVals; + } + + @Override + public void distribSetUp() throws Exception { + super.distribSetUp(); + // can help to hide this when testing and looking at logs + //ignoreException("shard update error"); + useFactory("solr.StandardDirectoryFactory"); + } + + public ChaosMonkeyNothingIsSafeWithPassiveReplicasTest() { + super(); + numPassiveReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; + numRealtimeOrAppendReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; + sliceCount = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.slicecount", "-1")); + if (sliceCount == -1) { + sliceCount = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1; + } + + int numNodes = sliceCount * (numRealtimeOrAppendReplicas + numPassiveReplicas); + fixShardCount(numNodes); + log.info("Starting ChaosMonkey test with {} shards and {} nodes", sliceCount, numNodes); + + // None of the operations used here are particularly costly, so this should work. + // Using this low timeout will also help us catch index stalling. + clientSoTimeout = 5000; + } + + @Override + protected boolean useAppendReplicas() { + return useAppendReplicas; + } + + @Test + public void test() throws Exception { + cloudClient.setSoTimeout(clientSoTimeout); + DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(DEFAULT_COLLECTION); + assertEquals(this.sliceCount, docCollection.getSlices().size()); + Slice s = docCollection.getSlice("shard1"); + assertNotNull(s); + assertEquals("Unexpected number of replicas. Collection: " + docCollection, numRealtimeOrAppendReplicas + numPassiveReplicas, s.getReplicas().size()); + assertEquals("Unexpected number of passive replicas. Collection: " + docCollection, numPassiveReplicas, s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); + assertEquals(useAppendReplicas()?0:numRealtimeOrAppendReplicas, s.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); + assertEquals(useAppendReplicas()?numRealtimeOrAppendReplicas:0, s.getReplicas(EnumSet.of(Replica.Type.APPEND)).size()); + + boolean testSuccessful = false; + try { + handle.clear(); + handle.put("timestamp", SKIPVAL); + ZkStateReader zkStateReader = cloudClient.getZkStateReader(); + // make sure we have leaders for each shard + for (int j = 1; j < sliceCount; j++) { + zkStateReader.getLeaderRetry(DEFAULT_COLLECTION, "shard" + j, 10000); + } // make sure we again have leaders for each shard + + waitForRecoveriesToFinish(false); + + // we cannot do delete by query + // as it's not supported for recovery + del("*:*"); + + List threads = new ArrayList<>(); + List indexTreads = new ArrayList<>(); + int threadCount = TEST_NIGHTLY ? 3 : 1; + int i = 0; + for (i = 0; i < threadCount; i++) { + StoppableIndexingThread indexThread = new StoppableIndexingThread(controlClient, cloudClient, Integer.toString(i), true); + threads.add(indexThread); + indexTreads.add(indexThread); + indexThread.start(); + } + + threadCount = 1; + i = 0; + for (i = 0; i < threadCount; i++) { + StoppableSearchThread searchThread = new StoppableSearchThread(cloudClient); + threads.add(searchThread); + searchThread.start(); + } + + // TODO: we only do this sometimes so that we can sometimes compare against control, + // it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer + boolean runFullThrottle = random().nextBoolean(); + if (runFullThrottle) { + FullThrottleStoppableIndexingThread ftIndexThread = new FullThrottleStoppableIndexingThread( + clients, "ft1", true); + threads.add(ftIndexThread); + ftIndexThread.start(); + } + + chaosMonkey.startTheMonkey(true, 10000); + try { + long runLength; + if (RUN_LENGTH != -1) { + runLength = RUN_LENGTH; + } else { + int[] runTimes; + if (TEST_NIGHTLY) { + runTimes = new int[] {5000, 6000, 10000, 15000, 25000, 30000, + 30000, 45000, 90000, 120000}; + } else { + runTimes = new int[] {5000, 7000, 15000}; + } + runLength = runTimes[random().nextInt(runTimes.length - 1)]; + } + + Thread.sleep(runLength); + } finally { + chaosMonkey.stopTheMonkey(); + } + + // ideally this should go into chaosMonkey + restartZk(1000 * (5 + random().nextInt(4))); + + for (StoppableThread indexThread : threads) { + indexThread.safeStop(); + } + + // start any downed jetties to be sure we still will end up with a leader per shard... + + // wait for stop... + for (StoppableThread indexThread : threads) { + indexThread.join(); + } + + // try and wait for any replications and what not to finish... + + Thread.sleep(2000); + + // wait until there are no recoveries... + waitForThingsToLevelOut(Integer.MAX_VALUE);//Math.round((runLength / 1000.0f / 3.0f))); + + // make sure we again have leaders for each shard + for (int j = 1; j < sliceCount; j++) { + zkStateReader.getLeaderRetry(DEFAULT_COLLECTION, "shard" + j, 30000); + } + + commit(); + + // TODO: assert we didnt kill everyone + + zkStateReader.updateLiveNodes(); + assertTrue(zkStateReader.getClusterState().getLiveNodes().size() > 0); + + + // we expect full throttle fails, but cloud client should not easily fail + for (StoppableThread indexThread : threads) { + if (indexThread instanceof StoppableIndexingThread && !(indexThread instanceof FullThrottleStoppableIndexingThread)) { + int failCount = ((StoppableIndexingThread) indexThread).getFailCount(); + assertFalse("There were too many update fails (" + failCount + " > " + FAIL_TOLERANCE + + ") - we expect it can happen, but shouldn't easily", failCount > FAIL_TOLERANCE); + } + } + + + Set addFails = getAddFails(indexTreads); + Set deleteFails = getDeleteFails(indexTreads); + // full throttle thread can + // have request fails + checkShardConsistency(!runFullThrottle, true, addFails, deleteFails); + + long ctrlDocs = controlClient.query(new SolrQuery("*:*")).getResults() + .getNumFound(); + + // ensure we have added more than 0 docs + long cloudClientDocs = cloudClient.query(new SolrQuery("*:*")) + .getResults().getNumFound(); + + assertTrue("Found " + ctrlDocs + " control docs", cloudClientDocs > 0); + + if (VERBOSE) System.out.println("control docs:" + + controlClient.query(new SolrQuery("*:*")).getResults() + .getNumFound() + "\n\n"); + + // try and make a collection to make sure the overseer has survived the expiration and session loss + + // sometimes we restart zookeeper as well + if (random().nextBoolean()) { + restartZk(1000 * (5 + random().nextInt(4))); + } + + try (CloudSolrClient client = createCloudClient("collection1")) { + createCollection(null, "testcollection", + 1, 1, 1, client, null, "conf1"); + + } + List numShardsNumReplicas = new ArrayList<>(2); + numShardsNumReplicas.add(1); + numShardsNumReplicas.add(1 + getPassiveReplicaCount()); + checkForCollection("testcollection", numShardsNumReplicas, null); + + testSuccessful = true; + } finally { + if (!testSuccessful) { + printLayout(); + } + } + } + + private Set getAddFails(List threads) { + Set addFails = new HashSet(); + for (StoppableIndexingThread thread : threads) { + addFails.addAll(thread.getAddFails()); + } + return addFails; + } + + private Set getDeleteFails(List threads) { + Set deleteFails = new HashSet(); + for (StoppableIndexingThread thread : threads) { + deleteFails.addAll(thread.getDeleteFails()); + } + return deleteFails; + } + + class FullThrottleStoppableIndexingThread extends StoppableIndexingThread { + private CloseableHttpClient httpClient = HttpClientUtil.createClient(null); + private volatile boolean stop = false; + int clientIndex = 0; + private ConcurrentUpdateSolrClient cusc; + private List clients; + private AtomicInteger fails = new AtomicInteger(); + + public FullThrottleStoppableIndexingThread(List clients, + String id, boolean doDeletes) { + super(controlClient, cloudClient, id, doDeletes); + setName("FullThrottleStopableIndexingThread"); + setDaemon(true); + this.clients = clients; + + cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(0)).getBaseURL(), httpClient, 8, 2); + cusc.setConnectionTimeout(10000); + cusc.setSoTimeout(clientSoTimeout); + } + + @Override + public void run() { + int i = 0; + int numDeletes = 0; + int numAdds = 0; + + while (true && !stop) { + String id = this.id + "-" + i; + ++i; + + if (doDeletes && random().nextBoolean() && deletes.size() > 0) { + String delete = deletes.remove(0); + try { + numDeletes++; + cusc.deleteById(delete); + } catch (Exception e) { + changeUrlOnError(e); + fails.incrementAndGet(); + } + } + + try { + numAdds++; + if (numAdds > (TEST_NIGHTLY ? 4002 : 197)) + continue; + SolrInputDocument doc = getDoc( + "id", + id, + i1, + 50, + t1, + "Saxon heptarchies that used to rip around so in old times and raise Cain. My, you ought to seen old Henry the Eight when he was in bloom. He WAS a blossom. He used to marry a new wife every day, and chop off her head next morning. And he would do it just as indifferent as if "); + cusc.add(doc); + } catch (Exception e) { + changeUrlOnError(e); + fails.incrementAndGet(); + } + + if (doDeletes && random().nextBoolean()) { + deletes.add(id); + } + + } + + log.info("FT added docs:" + numAdds + " with " + fails + " fails" + " deletes:" + numDeletes); + } + + private void changeUrlOnError(Exception e) { + if (e instanceof ConnectException) { + clientIndex++; + if (clientIndex > clients.size() - 1) { + clientIndex = 0; + } + cusc.shutdownNow(); + cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(clientIndex)).getBaseURL(), + httpClient, 30, 3); + } + } + + @Override + public void safeStop() { + stop = true; + cusc.blockUntilFinished(); + cusc.shutdownNow(); + IOUtils.closeQuietly(httpClient); + } + + @Override + public int getFailCount() { + return fails.get(); + } + + @Override + public Set getAddFails() { + throw new UnsupportedOperationException(); + } + + @Override + public Set getDeleteFails() { + throw new UnsupportedOperationException(); + } + + }; + + + // skip the randoms - they can deadlock... + @Override + protected void indexr(Object... fields) throws Exception { + SolrInputDocument doc = getDoc(fields); + indexDoc(doc); + } + + static class ErrorLoggingConcurrentUpdateSolrClient extends ConcurrentUpdateSolrClient { + public ErrorLoggingConcurrentUpdateSolrClient(String serverUrl, HttpClient httpClient, int queueSize, int threadCount) { + super(serverUrl, httpClient, queueSize, threadCount, null, false); + } + @Override + public void handleError(Throwable ex) { + log.warn("cusc error", ex); + } + } +} diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java index 5fee6049f8b7..46dc837d84c5 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -42,12 +42,20 @@ public class ChaosMonkeySafeLeaderWithPassiveReplicasTest extends AbstractFullDi private static final Integer RUN_LENGTH = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.runlength", "-1")); + private final boolean useAppendReplicas = random().nextBoolean(); + private final int numPassiveReplicas; private final int numRealtimeOrAppendReplicas; + @Override protected int getPassiveReplicaCount() { return numPassiveReplicas; } + + @Override + protected boolean useAppendReplicas() { + return useAppendReplicas; + } @BeforeClass public static void beforeSuperClass() { diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index 2ad42d164616..fd2d15b2fd31 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -29,7 +29,9 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.solr.client.solrj.embedded.JettySolrRunner; import org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner; +import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Replica.Type; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkNodeProps; @@ -369,16 +371,32 @@ public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLea return null; } + boolean canKillIndexer = canKillIndexer(slice); + + if (!canKillIndexer) { + monkeyLog("Number of indexer nodes (realtime or append) is not enough to kill one of them, Will only choose a passive replica to kill"); + } + int chance = chaosRandom.nextInt(10); - CloudJettyRunner cjetty; - if (chance <= 5 && aggressivelyKillLeaders) { + CloudJettyRunner cjetty = null; + if (chance <= 5 && aggressivelyKillLeaders && canKillIndexer) { // if killLeader, really aggressively go after leaders cjetty = shardToLeaderJetty.get(slice); } else { - // get random shard List jetties = shardToJetty.get(slice); - int index = chaosRandom.nextInt(jetties.size()); - cjetty = jetties.get(index); + // get random node + int attempt = 0; + while (true) { + attempt++; + int index = chaosRandom.nextInt(jetties.size()); + cjetty = jetties.get(index); + if (canKillIndexer || getTypeForJetty(slice, cjetty) == Replica.Type.PASSIVE) { + break; + } else if (attempt > 20) { + monkeyLog("Can't kill indexer nodes (realtime or append) and couldn't find a random passive node after 20 attempts - monkey cannot kill :("); + return null; + } + } ZkNodeProps leader = null; try { @@ -403,7 +421,7 @@ public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLea return null; } - boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(jetties.get(index).nodeName) + boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(cjetty.nodeName) || rtIsLeader; if (!aggressivelyKillLeaders && isLeader) { // we don't kill leaders... @@ -424,18 +442,61 @@ public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLea return cjetty; } - private int checkIfKillIsLegal(String slice, int numActive) throws KeeperException, InterruptedException { - for (CloudJettyRunner cloudJetty : shardToJetty.get(slice)) { + private Type getTypeForJetty(String sliceName, CloudJettyRunner cjetty) { + DocCollection docCollection = zkStateReader.getClusterState().getCollection(collection); + + Slice slice = docCollection.getSlice(sliceName); + + ZkNodeProps props = slice.getReplicasMap().get(cjetty.coreNodeName); + if (props == null) { + throw new RuntimeException("shard name " + cjetty.coreNodeName + " not found in " + slice.getReplicasMap().keySet()); + } + return Replica.Type.valueOf(props.getStr(ZkStateReader.REPLICA_TYPE)); + } + + private boolean canKillIndexer(String sliceName) throws KeeperException, InterruptedException { + int numIndexersFoundInShard = 0; + for (CloudJettyRunner cloudJetty : shardToJetty.get(sliceName)) { // get latest cloud state zkStateReader.forceUpdateCollection(collection); - Slice theShards = zkStateReader.getClusterState().getSlicesMap(collection) - .get(slice); + DocCollection docCollection = zkStateReader.getClusterState().getCollection(collection); + + Slice slice = docCollection.getSlice(sliceName); + + ZkNodeProps props = slice.getReplicasMap().get(cloudJetty.coreNodeName); + if (props == null) { + throw new RuntimeException("shard name " + cloudJetty.coreNodeName + " not found in " + slice.getReplicasMap().keySet()); + } + + final Replica.State state = Replica.State.getState(props.getStr(ZkStateReader.STATE_PROP)); + final Replica.Type replicaType = Replica.Type.valueOf(props.getStr(ZkStateReader.REPLICA_TYPE)); + final String nodeName = props.getStr(ZkStateReader.NODE_NAME_PROP); + + if (cloudJetty.jetty.isRunning() + && state == Replica.State.ACTIVE + && (replicaType == Replica.Type.APPEND || replicaType == Replica.Type.REALTIME) + && zkStateReader.getClusterState().liveNodesContain(nodeName)) { + numIndexersFoundInShard++; + } + } + return numIndexersFoundInShard > 1; + } + + private int checkIfKillIsLegal(String sliceName, int numActive) throws KeeperException, InterruptedException { + for (CloudJettyRunner cloudJetty : shardToJetty.get(sliceName)) { + + // get latest cloud state + zkStateReader.forceUpdateCollection(collection); + + DocCollection docCollection = zkStateReader.getClusterState().getCollection(collection); + + Slice slice = docCollection.getSlice(sliceName); - ZkNodeProps props = theShards.getReplicasMap().get(cloudJetty.coreNodeName); + ZkNodeProps props = slice.getReplicasMap().get(cloudJetty.coreNodeName); if (props == null) { - throw new RuntimeException("shard name " + cloudJetty.coreNodeName + " not found in " + theShards.getReplicasMap().keySet()); + throw new RuntimeException("shard name " + cloudJetty.coreNodeName + " not found in " + slice.getReplicasMap().keySet()); } final Replica.State state = Replica.State.getState(props.getStr(ZkStateReader.STATE_PROP)); From a342edd9eee95c30eabd00824a7c69f1d36ba33a Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Thu, 27 Apr 2017 16:38:24 -0700 Subject: [PATCH 06/41] Fix ChaosMonkey expire connection and connection loss properties --- .../src/java/org/apache/solr/cloud/ChaosMonkey.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index fd2d15b2fd31..f42831bc90f9 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -63,8 +63,8 @@ public class ChaosMonkey { private Map> shardToJetty; private static final Boolean MONKEY_ENABLED = Boolean.valueOf(System.getProperty("solr.tests.cloud.cm.enabled", "true")); - private static final Boolean CONN_LOSS = Boolean.valueOf(System.getProperty("solr.tests.cloud.cm.connloss", null)); - private static final Boolean EXP = Boolean.valueOf(System.getProperty("solr.tests.cloud.cm.exp", null)); + private static final String CONN_LOSS = System.getProperty("solr.tests.cloud.cm.connloss"); + private static final String EXP = System.getProperty("solr.tests.cloud.cm.exp"); private ZkTestServer zkServer; private ZkStateReader zkStateReader; @@ -108,12 +108,12 @@ public ChaosMonkey(ZkTestServer zkServer, ZkStateReader zkStateReader, } if (EXP != null) { - expireSessions = EXP; + expireSessions = Boolean.parseBoolean(EXP); } else { expireSessions = chaosRandom.nextBoolean(); } if (CONN_LOSS != null) { - causeConnectionLoss = CONN_LOSS; + causeConnectionLoss = Boolean.parseBoolean(CONN_LOSS); } else { causeConnectionLoss = chaosRandom.nextBoolean(); } From e7d54fa0b1e31b01be05c479975da36c53259a96 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Thu, 27 Apr 2017 17:51:52 -0700 Subject: [PATCH 07/41] Added logging to ChaosMonkey --- ...yNothingIsSafeWithPassiveReplicasTest.java | 7 ++- ...nkeySafeLeaderWithPassiveReplicasTest.java | 2 +- .../org/apache/solr/cloud/ChaosMonkey.java | 46 +++++++++++++++++++ 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index 9daec5b8f38c..e351541e395b 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -197,8 +197,7 @@ public void test() throws Exception { } runLength = runTimes[random().nextInt(runTimes.length - 1)]; } - - Thread.sleep(runLength); + ChaosMonkey.wait(runLength, DEFAULT_COLLECTION, zkStateReader); } finally { chaosMonkey.stopTheMonkey(); } @@ -291,6 +290,10 @@ public void test() throws Exception { } } + private void logCollectionStateSummary(String defaultCollection) { + + } + private Set getAddFails(List threads) { Set addFails = new HashSet(); for (StoppableIndexingThread thread : threads) { diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java index 46dc837d84c5..687e7a943264 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -156,7 +156,7 @@ public void test() throws Exception { runLength = runTimes[random().nextInt(runTimes.length - 1)]; } - Thread.sleep(runLength); + ChaosMonkey.wait(runLength, DEFAULT_COLLECTION, cloudClient.getZkStateReader()); } finally { chaosMonkey.stopTheMonkey(); } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index f42831bc90f9..c1f29dbff1de 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -21,10 +21,13 @@ import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Pattern; import org.apache.lucene.util.LuceneTestCase; import org.apache.solr.client.solrj.embedded.JettySolrRunner; @@ -41,6 +44,7 @@ import org.apache.solr.servlet.SolrDispatchFilter; import org.apache.solr.update.DirectUpdateHandler2; import org.apache.solr.util.RTimer; +import org.apache.solr.util.TimeOut; import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -562,6 +566,10 @@ public static void monkeyLog(String msg) { log.info("monkey: " + msg); } + public static void monkeyLog(String msg, Object...logParams) { + log.info("monkey: " + msg, logParams); + } + public void stopTheMonkey() { stop = true; try { @@ -680,4 +688,42 @@ public static boolean start(JettySolrRunner jetty) throws Exception { return true; } + public static void wait(long runLength, String collectionName, ZkStateReader zkStateReader) throws InterruptedException { + TimeOut t = new TimeOut(runLength, TimeUnit.MILLISECONDS); + while (!t.hasTimedOut()) { + Thread.sleep(Math.min(1000, runLength)); + logCollectionStateSummary(collectionName, zkStateReader); + } + } + + private static void logCollectionStateSummary(String collectionName, ZkStateReader zkStateReader) { + Pattern portPattern = Pattern.compile(".*:([0-9]*).*"); + DocCollection docCollection = zkStateReader.getClusterState().getCollection(collectionName); + if (docCollection == null) { + monkeyLog("Could not find collection {}", collectionName); + } + StringBuilder builder = new StringBuilder(); + builder.append("Collection status: {"); + for (Slice slice:docCollection.getSlices()) { + builder.append(slice.getName() + ": {"); + for (Replica replica:slice.getReplicas()) { + log.info(replica.toString()); + java.util.regex.Matcher m = portPattern.matcher(replica.getBaseUrl()); + m.find(); + String jettyPort = m.group(1); + builder.append(String.format(Locale.ROOT, "%s(%s): {state: %s, type: %s, leader: %s, Live: %s}, ", + replica.getName(), jettyPort, replica.getState(), replica.getType(), (replica.get("leader")!= null), zkStateReader.getClusterState().liveNodesContain(replica.getNodeName()))); + } + if (slice.getReplicas().size() > 0) { + builder.setLength(builder.length() - 2); + } + builder.append("}, "); + } + if (docCollection.getSlices().size() > 0) { + builder.setLength(builder.length() - 2); + } + builder.append("}"); + monkeyLog(builder.toString()); + } + } From 0f9baa4919840e406122bba4ef87897121be0649 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Fri, 28 Apr 2017 14:26:26 -0700 Subject: [PATCH 08/41] Minor improvements to ChaosMonkey tests --- ...yNothingIsSafeWithPassiveReplicasTest.java | 61 ++++++++++++++++- ...nkeySafeLeaderWithPassiveReplicasTest.java | 65 ++++++++++++++++++- .../org/apache/solr/cloud/ChaosMonkey.java | 8 +-- 3 files changed, 125 insertions(+), 9 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index e351541e395b..782e3dd2f8fb 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -16,13 +16,16 @@ */ package org.apache.solr.cloud; +import java.io.IOException; import java.lang.invoke.MethodHandles; import java.net.ConnectException; import java.util.ArrayList; import java.util.EnumSet; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import org.apache.http.client.HttpClient; @@ -32,25 +35,34 @@ import org.apache.solr.SolrTestCaseJ4.SuppressSSL; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; import org.apache.solr.client.solrj.impl.HttpClientUtil; import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.IOUtils; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.ReplicationHandler; +import org.apache.solr.util.TimeOut; +import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; + @Slow @SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") -//@ThreadLeakLingering(linger = 60000) +@ThreadLeakLingering(linger = 60000) @SuppressObjectReleaseTracker(bugUrl="Testing purposes") public class ChaosMonkeyNothingIsSafeWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { private static final int FAIL_TOLERANCE = 100; @@ -245,6 +257,7 @@ public void test() throws Exception { } } + waitForReplicationFromPassiveReplicas(DEFAULT_COLLECTION, zkStateReader, new TimeOut(30, TimeUnit.SECONDS)); Set addFails = getAddFails(indexTreads); Set deleteFails = getDeleteFails(indexTreads); @@ -290,8 +303,50 @@ public void test() throws Exception { } } - private void logCollectionStateSummary(String defaultCollection) { - + private void waitForReplicationFromPassiveReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException { + zkStateReader.forceUpdateCollection(collectionName); + DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); + for(Slice s:collection.getSlices()) { + Replica leader = s.getLeader(); + long leaderIndexVersion = -1; + while (leaderIndexVersion == -1 && !timeout.hasTimedOut()) { + leaderIndexVersion = getIndexVersion(leader); + Thread.sleep(1000); + } + for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE))) { + if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { + continue; + } + while (true) { + long replicaIndexVersion = getIndexVersion(passiveReplica); + if (leaderIndexVersion > replicaIndexVersion) { + if (timeout.hasTimedOut()) { + fail(String.format(Locale.ROOT, "Timed out waiting for replica %s (%d) to replicate from leader %s (%d)", passiveReplica.getName(), replicaIndexVersion, leader.getName(), leaderIndexVersion)); + } + log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); + Thread.sleep(1000); + } else { + break; + } + } + } + } + } + + @SuppressWarnings("unchecked") + private long getIndexVersion(Replica replica) throws IOException { + try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set("qt", "/replication"); + params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_SHOW_COMMITS); + try { + QueryResponse response = client.query(params); + return (Long)((List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS)).get(0).get("indexVersion"); + } catch (SolrServerException e) { + log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); + return -1; + } + } } private Set getAddFails(List threads) { diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java index 687e7a943264..f3b14810c9d7 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -16,20 +16,31 @@ */ package org.apache.solr.cloud; +import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.EnumSet; import java.util.List; +import java.util.Locale; import java.util.concurrent.TimeUnit; import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.ReplicationHandler; +import org.apache.solr.util.TimeOut; +import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -37,6 +48,7 @@ import org.slf4j.LoggerFactory; @Slow +@SuppressObjectReleaseTracker(bugUrl="Testing purposes") public class ChaosMonkeySafeLeaderWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -186,11 +198,13 @@ public void test() throws Exception { Thread.sleep(3000); waitForThingsToLevelOut(180000); - - checkShardConsistency(batchSize == 1, true); log.info("control docs:" + controlClient.query(new SolrQuery("*:*")).getResults().getNumFound() + "\n\n"); + waitForReplicationFromPassiveReplicas(DEFAULT_COLLECTION, cloudClient.getZkStateReader(), new TimeOut(30, TimeUnit.SECONDS)); + + checkShardConsistency(batchSize == 1, true); + // try and make a collection to make sure the overseer has survived the expiration and session loss // sometimes we restart zookeeper as well @@ -225,6 +239,53 @@ private void tryDelete() throws Exception { } } + private void waitForReplicationFromPassiveReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException { + zkStateReader.forceUpdateCollection(collectionName); + DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); + for(Slice s:collection.getSlices()) { + Replica leader = s.getLeader(); + long leaderIndexVersion = -1; + while (leaderIndexVersion == -1 && !timeout.hasTimedOut()) { + leaderIndexVersion = getIndexVersion(leader); + Thread.sleep(1000); + } + for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE))) { + if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { + continue; + } + while (true) { + long replicaIndexVersion = getIndexVersion(passiveReplica); + if (leaderIndexVersion > replicaIndexVersion) { + if (timeout.hasTimedOut()) { + fail(String.format(Locale.ROOT, "Timed out waiting for replica %s to replicate from leader %s", passiveReplica.getName(), leader.getName())); + } + log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); + Thread.sleep(1000); + } else { + break; + } + } + } + } + } + + @SuppressWarnings("unchecked") + private long getIndexVersion(Replica replica) throws IOException { + try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set("qt", "/replication"); + params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_SHOW_COMMITS); + try { + QueryResponse response = client.query(params); + return (Long)((List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS)).get(0).get("indexVersion"); + } catch (SolrServerException e) { + log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); + return -1; + } + } + } + + // skip the randoms - they can deadlock... @Override protected void indexr(Object... fields) throws Exception { diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index c1f29dbff1de..b9080c5a1dee 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -129,10 +129,9 @@ public ChaosMonkey(ZkTestServer zkServer, ZkStateReader zkStateReader, // TODO: expire all clients at once? public void expireSession(final JettySolrRunner jetty) { - monkeyLog("expire session for " + jetty.getLocalPort() + " !"); - CoreContainer cores = jetty.getCoreContainer(); if (cores != null) { + monkeyLog("expire session for " + jetty.getLocalPort() + " !"); causeConnectionLoss(jetty); long sessionId = cores.getZkController().getZkClient() .getSolrZooKeeper().getSessionId(); @@ -152,7 +151,7 @@ public void expireRandomSession() throws KeeperException, InterruptedException { } public void randomConnectionLoss() throws KeeperException, InterruptedException { - monkeyLog("cause connection loss!"); + monkeyLog("Will cause connection loss!"); String sliceName = getRandomSlice(); CloudJettyRunner jetty = getRandomJetty(sliceName, aggressivelyKillLeaders); @@ -165,6 +164,7 @@ public void randomConnectionLoss() throws KeeperException, InterruptedException public static void causeConnectionLoss(JettySolrRunner jetty) { CoreContainer cores = jetty.getCoreContainer(); if (cores != null) { + monkeyLog("Will cause connection loss on " + jetty.getLocalPort()); SolrZkClient zkClient = cores.getZkController().getZkClient(); zkClient.getSolrZooKeeper().closeCnxn(); } @@ -648,7 +648,7 @@ public static void start(List jettys) throws Exception { } public static boolean start(JettySolrRunner jetty) throws Exception { - + monkeyLog("starting jetty! " + jetty.getLocalPort()); IpTables.unblockPort(jetty.getLocalPort()); try { jetty.start(); From e7c8cec61c5b27bd9ce40eaa29a2f621a0bf2640 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Fri, 28 Apr 2017 15:53:12 -0700 Subject: [PATCH 09/41] Some code cleanup --- .../org/apache/solr/cloud/CloudDescriptor.java | 1 - .../apache/solr/cloud/CreateCollectionCmd.java | 14 ++++++-------- .../cloud/OverseerCollectionMessageHandler.java | 4 +++- .../org/apache/solr/cloud/RecoveryStrategy.java | 2 +- .../org/apache/solr/cloud/SplitShardCmd.java | 2 +- .../org/apache/solr/cloud/ZkController.java | 17 ++++++++--------- .../solr/cloud/overseer/ReplicaMutator.java | 4 ++-- .../solr/cloud/overseer/SliceMutator.java | 6 ++---- .../solr/cloud/overseer/ZkStateWriter.java | 2 +- .../org/apache/solr/core/CoreContainer.java | 3 ++- .../solr/handler/admin/CoreAdminHandler.java | 2 -- .../handler/component/RealTimeGetComponent.java | 1 + .../solr/update/DefaultSolrCoreState.java | 10 +++------- .../org/apache/solr/update/UpdateCommand.java | 1 - .../processor/DistributedUpdateProcessor.java | 1 - .../org/apache/solr/cloud/OverseerTest.java | 4 ++-- .../org/apache/solr/common/cloud/Slice.java | 2 +- .../apache/solr/common/cloud/ZkStateReader.java | 2 +- .../solr/common/params/CoreAdminParams.java | 2 +- .../solr/cloud/AbstractDistribZkTestBase.java | 3 --- 20 files changed, 35 insertions(+), 48 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java index c39272c42f61..1f387ad9b4ce 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java +++ b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java @@ -64,7 +64,6 @@ public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { if (Strings.isNullOrEmpty(nodeName)) this.nodeName = null; this.numShards = PropertiesUtil.toInteger(props.getProperty(CloudDescriptor.NUM_SHARDS), null); - System.out.println("ReplicaType: " + props); this.replicaType = Replica.Type.valueOf(props.getProperty(CloudDescriptor.REPLICA_TYPE, Replica.Type.REALTIME.toString())); for (String propName : props.stringPropertyNames()) { if (propName.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) { diff --git a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java index 4b6971ee4659..2cc331096712 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java @@ -136,20 +136,19 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul positionVsNodes = new HashMap<>(); } else { - if (numRealtimeReplicas > nodeList.size()) { - log.warn("Specified " - + REALTIME_REPLICAS - + " of " - + numRealtimeReplicas + int totalNumReplicas = numRealtimeReplicas + numAppendReplicas + numPassiveReplicas; + if (totalNumReplicas > nodeList.size()) { + log.warn("Specified number of replicas of " + + totalNumReplicas + " on collection " + collectionName - + " is higher than or equal to the number of Solr instances currently live or live and part of your " + CREATE_NODE_SET + "(" + + " is higher than the number of Solr instances currently live or live and part of your " + CREATE_NODE_SET + "(" + nodeList.size() + "). It's unusual to run two replica of the same slice on the same Solr-instance."); } int maxShardsAllowedToCreate = maxShardsPerNode * nodeList.size(); - int requestedShardsToCreate = numSlices * (numRealtimeReplicas + numPassiveReplicas + numAppendReplicas); + int requestedShardsToCreate = numSlices * totalNumReplicas; if (maxShardsAllowedToCreate < requestedShardsToCreate) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Cannot create collection " + collectionName + ". Value of " + MAX_SHARDS_PER_NODE + " is " + maxShardsPerNode @@ -239,7 +238,6 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul params.set(CoreAdminParams.SHARD, position.shard); params.set(ZkStateReader.NUM_SHARDS_PROP, numSlices); params.set(CoreAdminParams.NEW_COLLECTION, "true"); - // This is used to tell the CoreAdminHandler that the new core doesn't need a tlog in case of passive replicas params.set(CoreAdminParams.REPLICA_TYPE, position.type.name()); if (async != null) { diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java index 4cae7856e479..5b1238081ead 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java @@ -156,10 +156,12 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler static final String SKIP_CREATE_REPLICA_IN_CLUSTER_STATE = "skipCreateReplicaInClusterState"; - //nocommit: review public static final Map COLL_PROPS = Collections.unmodifiableMap(makeMap( ROUTER, DocRouter.DEFAULT_NAME, ZkStateReader.REPLICATION_FACTOR, "1", + ZkStateReader.REALTIME_REPLICAS, "1", + ZkStateReader.APPEND_REPLICAS, "0", + ZkStateReader.PASSIVE_REPLICAS, "0", ZkStateReader.MAX_SHARDS_PER_NODE, "1", ZkStateReader.AUTO_ADD_REPLICAS, "false", DocCollection.RULE, null, diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index 5a952ea0b9d7..53b6f3e2b57f 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -262,7 +262,7 @@ final private void commitOnLeader(String leaderUrl) throws SolrServerException, ureq.setParams(new ModifiableSolrParams()); ureq.getParams().set(DistributedUpdateProcessor.COMMIT_END_POINT, true); // ureq.getParams().set(UpdateParams.OPEN_SEARCHER, onlyLeaderIndexes);// nocommit: Why do we need to open searcher if "onlyLeaderIndexes"? - ureq.getParams().set(UpdateParams.OPEN_SEARCHER, true); + ureq.getParams().set(UpdateParams.OPEN_SEARCHER, false); ureq.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, true).process( client); } diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java index 837af7985f09..b9494edd4995 100644 --- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java @@ -205,7 +205,7 @@ public boolean split(ClusterState clusterState, ZkNodeProps message, NamedList r for (int i = 0; i < subRanges.size(); i++) { String subSlice = slice + "_" + i; subSlices.add(subSlice); - String subShardName = collectionName + "_" + subSlice + "_replica1"; + String subShardName = Assign.buildCoreName(collectionName, subSlice, Replica.Type.REALTIME, 1); subShardNames.add(subShardName); } diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index b239dd2445dc..e4e98da293fd 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -892,7 +892,7 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov if (replica == null || replica.getType() != Type.PASSIVE) { joinElection(desc, afterExpiration, joinAtHead); } else if (replica.getType() == Type.PASSIVE) { - log.debug("Replica {} skipping election because replica is passive", coreZkNodeName); + log.debug("Replica {} skipping election because it's type is {}", coreZkNodeName, Type.PASSIVE); startReplicationFromLeader(coreName, false); } } catch (InterruptedException e) { @@ -923,15 +923,15 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov // leader election perhaps? UpdateLog ulog = core.getUpdateHandler().getUpdateLog(); - boolean isReplicaInOnlyLeaderIndexes = replicaType == Replica.Type.APPEND && !isLeader; - if (isReplicaInOnlyLeaderIndexes) { + boolean isAppendAndNotLeader = replicaType == Replica.Type.APPEND && !isLeader; + if (isAppendAndNotLeader) { String commitVersion = ReplicateFromLeader.getCommitVersion(core); if (commitVersion != null) { ulog.copyOverOldUpdates(Long.parseLong(commitVersion)); } } // we will call register again after zk expiration and on reload - if (!afterExpiration && !core.isReloaded() && ulog != null && !isReplicaInOnlyLeaderIndexes) { + if (!afterExpiration && !core.isReloaded() && ulog != null && !isAppendAndNotLeader) { // disable recovery in case shard is in construction state (for shard splits) Slice slice = getClusterState().getSlice(collection, shardId); if (slice.getState() != Slice.State.CONSTRUCTION || !isLeader) { @@ -950,7 +950,7 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov boolean didRecovery = checkRecovery(recoverReloadedCores, isLeader, skipRecovery, collection, coreZkNodeName, core, cc, afterExpiration); if (!didRecovery) { - if (isReplicaInOnlyLeaderIndexes) { + if (isAppendAndNotLeader) { startReplicationFromLeader(coreName, true); } publish(desc, Replica.State.ACTIVE); @@ -969,7 +969,7 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov } public void startReplicationFromLeader(String coreName, boolean switchTransactionLog) throws InterruptedException { - log.info(coreName + " starting replication from leader"); + log.info("{} starting replication from leader", coreName); ReplicateFromLeader replicateFromLeader = new ReplicateFromLeader(cc, coreName); if (replicateFromLeaders.putIfAbsent(coreName, replicateFromLeader) == null) { replicateFromLeader.startReplication(switchTransactionLog); @@ -979,7 +979,7 @@ public void startReplicationFromLeader(String coreName, boolean switchTransactio } public void stopReplicationFromLeader(String coreName) { - log.info(coreName + " stopping replication from leader"); + log.info("{} stopping replication from leader", coreName); ReplicateFromLeader replicateFromLeader = replicateFromLeaders.remove(coreName); if (replicateFromLeader != null) { replicateFromLeader.stopReplication(); @@ -1203,7 +1203,7 @@ public void publish(final CoreDescriptor cd, final Replica.State state, boolean if (state != Replica.State.DOWN) { final Replica.State lirState = getLeaderInitiatedRecoveryState(collection, shardId, coreNodeName); if (lirState != null) { - assert cd.getCloudDescriptor().getReplicaType() != Replica.Type.PASSIVE; + assert cd.getCloudDescriptor().getReplicaType() != Replica.Type.PASSIVE: "LIR should not happen for passive replicas!"; if (state == Replica.State.ACTIVE) { // trying to become active, so leader-initiated state must be recovering if (lirState == Replica.State.RECOVERING) { @@ -1378,7 +1378,6 @@ private void waitForShardId(CoreDescriptor cd) { final String shardId = zkStateReader.getClusterState().getShardId(cd.getCollectionName(), getNodeName(), cd.getName()); if (shardId != null) { cd.getCloudDescriptor().setShardId(shardId); - log.debug("Shard ID is {} for core {} ", shardId, cd.getName()); return; } try { diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java index c467405c9e59..9758c8f09226 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java @@ -313,7 +313,7 @@ private ZkWriteCommand updateState(final ClusterState prevState, ZkNodeProps mes Replica replica = new Replica(coreNodeName, replicaProps); - log.debug("Will update state for replica: " + replica); + log.debug("Will update state for replica: {}", replica); Map sliceProps = null; Map replicas; @@ -335,7 +335,7 @@ private ZkWriteCommand updateState(final ClusterState prevState, ZkNodeProps mes slice = new Slice(sliceName, replicas, sliceProps); DocCollection newCollection = CollectionMutator.updateSlice(collectionName, collection, slice); - log.debug("Collection is now: " + newCollection); + log.debug("Collection is now: {}", newCollection); return new ZkWriteCommand(collectionName, newCollection); } diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java index b1e969197a5d..5436998f924a 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/SliceMutator.java @@ -67,8 +67,6 @@ public ZkWriteCommand addReplica(ClusterState clusterState, ZkNodeProps message) return ZkStateWriter.NO_OP; } String coreNodeName = Assign.assignNode(collection); -// Replica replica = new Replica(coreNodeName, - // coreNodeName overlaps? Replica replica = new Replica(coreNodeName, makeMap( ZkStateReader.CORE_NAME_PROP, message.getStr(ZkStateReader.CORE_NAME_PROP), @@ -257,8 +255,8 @@ public static DocCollection updateReplica(DocCollection collection, final Slice replicasCopy.put(replica.getName(), replica); } Slice newSlice = new Slice(slice.getName(), replicasCopy, slice.getProperties()); - log.info("Old Slice: " + slice); - log.info("New Slice: " + newSlice); + log.debug("Old Slice: {}", slice); + log.debug("New Slice: {}", newSlice); return CollectionMutator.updateSlice(collection.getName(), collection, newSlice); } } diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java index a906b86d8465..880c5ac45aaf 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java @@ -268,7 +268,7 @@ public ClusterState writePendingUpdates() throws IllegalStateException, KeeperEx } } - log.debug("New Cluster State is: " + clusterState); + log.trace("New Cluster State is: {}", clusterState); return clusterState; } diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index a4f15f7f3f83..773c50c6614a 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -1280,7 +1280,8 @@ public void unload(String name, boolean deleteIndexDir, boolean deleteDataDir, b if (zkSys.getZkController() != null) { // cancel recovery in cloud mode core.getSolrCoreState().cancelRecovery(); - if (core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.PASSIVE) { // TODO: Also for Replica.Type.ACTIVE? + if (core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.PASSIVE + || core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.APPEND) { // Stop replication if this is part of a passive replica before closing the code zkSys.getZkController().stopReplicationFromLeader(name); } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 31ad4f78a3dd..1710da9103a4 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -171,9 +171,7 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw final CallInfo callInfo = new CallInfo(this, req, rsp, op); if (taskId == null) { - log.info("Starting Operation: " + req); callInfo.call(); - log.info("Done with Operation: " + req); } else { try { MDC.put("CoreAdminHandler.asyncId", taskId); diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java index c4ee76b0476e..d785868db320 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java @@ -108,6 +108,7 @@ public void process(ResponseBuilder rb) throws IOException if (req.getCore().getCoreDescriptor().getCloudDescriptor() != null && !req.getCore().getCoreDescriptor().getCloudDescriptor().requiresTransactionLog()) { + //nocommit: forward request to leader return; } diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java index f19b2dfb0e65..bc2afa879c34 100644 --- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java +++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java @@ -16,7 +16,6 @@ */ package org.apache.solr.update; -import java.io.Closeable; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.concurrent.ExecutionException; @@ -33,8 +32,8 @@ import org.apache.lucene.search.Sort; import org.apache.solr.cloud.ActionThrottle; import org.apache.solr.cloud.RecoveryStrategy; -import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.SolrException; import org.apache.solr.core.CoreContainer; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.core.DirectoryFactory; @@ -66,8 +65,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover private DirectoryFactory directoryFactory; private final RecoveryStrategy.Builder recoveryStrategyBuilder; - private volatile RecoveryStrategy recoveryStrat; //nocommit: Make interface -// private volatile Thread recoveryStrat; + private volatile RecoveryStrategy recoveryStrat; private volatile boolean lastReplicationSuccess = true; @@ -367,11 +365,9 @@ public void run() { public void cancelRecovery() { if (recoveryStrat != null) { try { - ((Closeable)recoveryStrat).close(); + recoveryStrat.close(); } catch (NullPointerException e) { // okay - } catch (IOException e) { - // okay } } } diff --git a/solr/core/src/java/org/apache/solr/update/UpdateCommand.java b/solr/core/src/java/org/apache/solr/update/UpdateCommand.java index 6c0fc50217bc..b124271d977d 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateCommand.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateCommand.java @@ -34,7 +34,6 @@ public abstract class UpdateCommand implements Cloneable { public static int PEER_SYNC = 0x00000004; // update command is a missing update being provided by a peer. public static int IGNORE_AUTOCOMMIT = 0x00000008; // this update should not count toward triggering of autocommits. public static int CLEAR_CACHES = 0x00000010; // clear caches associated with the update log. used when applying reordered DBQ updates when doing an add. - // TODO: rename to something like "APPEND_REPLICAS_IGNORE_IW", or maybe just "FROM_LEADER"? public static int IGNORE_INDEXWRITER = 0x00000020; public UpdateCommand(SolrQueryRequest req) { diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java index 41fde185dee8..52cb034f5709 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java @@ -1979,7 +1979,6 @@ private List getCollectionUrls(SolrQueryRequest req, String collection, En for (Entry entry : shardMap.entrySet()) { if (!types.contains(entry.getValue().getType())) { - log.info("getCollectionUrls: Skipping replica " + entry.getValue().getName());//nocommit: too verbose continue; } ZkCoreNodeProps nodeProps = new ZkCoreNodeProps(entry.getValue()); diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java index f5c5db0ec86e..b0721a2b8e07 100644 --- a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java @@ -1210,7 +1210,7 @@ public void testExternalClusterStateChangeBehavior() throws Exception { ZkStateReader.STATE_PROP, Replica.State.RECOVERING.toString()); q.offer(Utils.toJSON(m)); - + m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.STATE.toLower(), ZkStateReader.BASE_URL_PROP, "http://127.0.0.1/solr", @@ -1221,7 +1221,7 @@ public void testExternalClusterStateChangeBehavior() throws Exception { ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString()); q.offer(Utils.toJSON(m)); - + Stat stat = new Stat(); byte[] data = zkClient.getData("/clusterstate.json", null, stat, true); // Simulate an external modification diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java index 2cd716c25775..3f84e1e89455 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java @@ -205,7 +205,7 @@ private Map makeReplicas(Map genericReplicas) { private Replica findLeader() { for (Replica replica : replicas.values()) { if (replica.getStr(LEADER) != null) { - assert replica.getType() == Type.APPEND || replica.getType() == Type.REALTIME; + assert replica.getType() == Type.APPEND || replica.getType() == Type.REALTIME: "Passive replica should not become leader!"; return replica; } } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index af7ab2618cd6..625230fec81d 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -791,7 +791,7 @@ public List getReplicaProps(String collection, String shardId, public List getReplicaProps(String collection, String shardId, String thisCoreNodeName, Replica.State mustMatchStateFilter, Replica.State mustNotMatchStateFilter) { - //nocommit + //nocommit: We don't need all these getReplicaProps overloading. Also, it's odd that the default is to return replicas of type APPEND and REALTIME only return getReplicaProps(collection, shardId, thisCoreNodeName, mustMatchStateFilter, null, EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)); } diff --git a/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java b/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java index e64892dc6191..3fc6157c595a 100644 --- a/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java +++ b/solr/solrj/src/java/org/apache/solr/common/params/CoreAdminParams.java @@ -129,7 +129,7 @@ public abstract class CoreAdminParams public static final String NEW_COLLECTION = "newCollection"; /** - * Tells the CoreAdminHandler that the new Core will be a replica of this type. + * Tells the CoreAdminHandler that the new Core will be a replica of a particular {@link org.apache.solr.common.cloud.Replica.Type} */ public static final String REPLICA_TYPE = "replicaType"; diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java index e5d261968973..7141eedbc991 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java @@ -258,7 +258,6 @@ public static void verifyReplicaStatus(ZkStateReader reader, String collection, int maxIterations = 100; Replica.State coreState = null; while(maxIterations-->0) { - System.out.println("ClusterState" + reader.getClusterState()); Slice slice = reader.getClusterState().getSlice(collection, shard); if(slice!=null) { Replica replica = slice.getReplicasMap().get(coreNodeName); @@ -267,8 +266,6 @@ public static void verifyReplicaStatus(ZkStateReader reader, String collection, if(coreState == expectedState) { return; } - } else { - System.out.println(slice.getReplicasMap()); } } Thread.sleep(50); From 415f32339b92662a1ca44f977b06f1898a045752 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Mon, 1 May 2017 13:28:19 -0700 Subject: [PATCH 10/41] Some improvements and better logging for ChaosMonkey tests --- ...yNothingIsSafeWithPassiveReplicasTest.java | 73 +++++++++++++++---- ...nkeySafeLeaderWithPassiveReplicasTest.java | 4 +- 2 files changed, 61 insertions(+), 16 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index 782e3dd2f8fb..87bbb97342df 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -257,13 +257,13 @@ public void test() throws Exception { } } - waitForReplicationFromPassiveReplicas(DEFAULT_COLLECTION, zkStateReader, new TimeOut(30, TimeUnit.SECONDS)); + waitForReplicationFromReplicas(DEFAULT_COLLECTION, zkStateReader, new TimeOut(30, TimeUnit.SECONDS)); Set addFails = getAddFails(indexTreads); Set deleteFails = getDeleteFails(indexTreads); // full throttle thread can - // have request fails - checkShardConsistency(!runFullThrottle, true, addFails, deleteFails); + // have request fails + checkShardConsistency(!runFullThrottle, true, addFails, deleteFails); long ctrlDocs = controlClient.query(new SolrQuery("*:*")).getResults() .getNumFound(); @@ -286,9 +286,9 @@ public void test() throws Exception { } try (CloudSolrClient client = createCloudClient("collection1")) { - createCollection(null, "testcollection", - 1, 1, 1, client, null, "conf1"); - + // We don't really know how many live nodes we have at this point, so "maxShardsPerNode" needs to be > 1 + createCollection(null, "testcollection", + 1, 1, 10, client, null, "conf1"); } List numShardsNumReplicas = new ArrayList<>(2); numShardsNumReplicas.add(1); @@ -298,12 +298,32 @@ public void test() throws Exception { testSuccessful = true; } finally { if (!testSuccessful) { + logReplicaTypesReplicationInfo(DEFAULT_COLLECTION, cloudClient.getZkStateReader()); printLayout(); } } } - private void waitForReplicationFromPassiveReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException { + private void logReplicaTypesReplicationInfo(String collectionName, ZkStateReader zkStateReader) throws KeeperException, InterruptedException, IOException { + log.info("## Extra Replica.Type information of the cluster"); + zkStateReader.forceUpdateCollection(collectionName); + DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); + for(Slice s:collection.getSlices()) { + Replica leader = s.getLeader(); + for (Replica r:s.getReplicas()) { + if (!zkStateReader.getClusterState().liveNodesContain(r.getNodeName())) { + log.info("Replica {} not in liveNodes", r.getName()); + continue; + } + if (r.equals(leader)) { + log.info("Replica {} is leader", r.getName()); + } + logReplicationDetails(r); + } + } + } + + private void waitForReplicationFromReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException { zkStateReader.forceUpdateCollection(collectionName); DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); for(Slice s:collection.getSlices()) { @@ -311,29 +331,36 @@ private void waitForReplicationFromPassiveReplicas(String collectionName, ZkStat long leaderIndexVersion = -1; while (leaderIndexVersion == -1 && !timeout.hasTimedOut()) { leaderIndexVersion = getIndexVersion(leader); - Thread.sleep(1000); + if (leaderIndexVersion < 0) { + Thread.sleep(1000); + } } - for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE))) { + for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE,Replica.Type.APPEND))) { if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { continue; } while (true) { long replicaIndexVersion = getIndexVersion(passiveReplica); - if (leaderIndexVersion > replicaIndexVersion) { + if (leaderIndexVersion == replicaIndexVersion) { + log.debug("Leader replica's version ({}) in sync with replica({}): {} == {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); + break; + } else { if (timeout.hasTimedOut()) { + logReplicaTypesReplicationInfo(collectionName, zkStateReader); fail(String.format(Locale.ROOT, "Timed out waiting for replica %s (%d) to replicate from leader %s (%d)", passiveReplica.getName(), replicaIndexVersion, leader.getName(), leaderIndexVersion)); } - log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); + if (leaderIndexVersion > replicaIndexVersion) { + log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); + } else { + log.debug("Leader replica's version ({}) is lower than passive replica({}): {} < {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); + } Thread.sleep(1000); - } else { - break; } } } } } - @SuppressWarnings("unchecked") private long getIndexVersion(Replica replica) throws IOException { try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { ModifiableSolrParams params = new ModifiableSolrParams(); @@ -341,13 +368,29 @@ private long getIndexVersion(Replica replica) throws IOException { params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_SHOW_COMMITS); try { QueryResponse response = client.query(params); - return (Long)((List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS)).get(0).get("indexVersion"); + @SuppressWarnings("unchecked") + List> commits = (List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS); + return (Long)commits.get(commits.size() - 1).get("indexVersion"); } catch (SolrServerException e) { log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); return -1; } } } + + private void logReplicationDetails(Replica replica) throws IOException { + try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set("qt", "/replication"); + params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_DETAILS); + try { + QueryResponse response = client.query(params); + log.info("{}: {}", replica.getName(), response.getResponse()); + } catch (SolrServerException e) { + log.warn("Unable to ger replication details for replica {}", replica.getName(), e); + } + } + } private Set getAddFails(List threads) { Set addFails = new HashSet(); diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java index f3b14810c9d7..df89adb63fe6 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -277,7 +277,9 @@ private long getIndexVersion(Replica replica) throws IOException { params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_SHOW_COMMITS); try { QueryResponse response = client.query(params); - return (Long)((List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS)).get(0).get("indexVersion"); + List> commits = (List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS); + System.out.println(commits); //TODO: How to get the correct indexVersion from slave? + return (Long)commits.get(0).get("indexVersion"); } catch (SolrServerException e) { log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); return -1; From ccea10536a00b1ce435245d13763ea7dadde0c5d Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Mon, 1 May 2017 14:47:47 -0700 Subject: [PATCH 11/41] Reduce code duplication in ChaosMonkey tests --- .../solr/cloud/BasicDistributedZkTest.java | 6 - .../cloud/ChaosMonkeyNothingIsSafeTest.java | 132 +--------- ...yNothingIsSafeWithPassiveReplicasTest.java | 227 +----------------- ...nkeySafeLeaderWithPassiveReplicasTest.java | 60 +---- .../FullThrottleStoppableIndexingThread.java | 154 ++++++++++++ .../solr/BaseDistributedSearchTestCase.java | 2 +- .../cloud/AbstractFullDistribZkTestBase.java | 97 +++++++- .../org/apache/solr/cloud/ChaosMonkey.java | 7 + 8 files changed, 264 insertions(+), 421 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java index 8d0839f018e5..162164b01424 100644 --- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java @@ -1075,12 +1075,6 @@ private void testMultipleCollections() throws Exception { assertEquals(collection3Docs, collection2Docs - 1); } - protected SolrInputDocument getDoc(Object... fields) throws Exception { - SolrInputDocument doc = new SolrInputDocument(); - addFields(doc, fields); - return doc; - } - protected void indexDoc(String collection, SolrInputDocument doc) throws IOException, SolrServerException { List clients = otherCollectionClients.get(collection); int which = (doc.getField(id).toString().hashCode() & 0x7fffffff) % clients.size(); diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java index a389005cb66a..0130dc822dc0 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java @@ -16,33 +16,21 @@ */ package org.apache.solr.cloud; -import java.lang.invoke.MethodHandles; -import java.net.ConnectException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.http.client.HttpClient; -import org.apache.http.impl.client.CloseableHttpClient; import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker; import org.apache.solr.SolrTestCaseJ4.SuppressSSL; -import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; -import org.apache.solr.client.solrj.impl.HttpClientUtil; -import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.ZkStateReader; -import org.apache.solr.common.util.IOUtils; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; @Slow @SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") @@ -51,8 +39,6 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase { private static final int FAIL_TOLERANCE = 100; - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private static final Integer RUN_LENGTH = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.runlength", "-1")); private final boolean onlyLeaderIndexes = random().nextBoolean(); @@ -158,8 +144,8 @@ public void test() throws Exception { // it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer boolean runFullThrottle = random().nextBoolean(); if (runFullThrottle) { - FullThrottleStoppableIndexingThread ftIndexThread = new FullThrottleStoppableIndexingThread( - clients, "ft1", true); + FullThrottleStoppableIndexingThread ftIndexThread = + new FullThrottleStoppableIndexingThread(controlClient, cloudClient, clients, "ft1", true, this.clientSoTimeout); threads.add(ftIndexThread); ftIndexThread.start(); } @@ -289,111 +275,6 @@ private Set getDeleteFails(List threads) { return deleteFails; } - class FullThrottleStoppableIndexingThread extends StoppableIndexingThread { - private CloseableHttpClient httpClient = HttpClientUtil.createClient(null); - private volatile boolean stop = false; - int clientIndex = 0; - private ConcurrentUpdateSolrClient cusc; - private List clients; - private AtomicInteger fails = new AtomicInteger(); - - public FullThrottleStoppableIndexingThread(List clients, - String id, boolean doDeletes) { - super(controlClient, cloudClient, id, doDeletes); - setName("FullThrottleStopableIndexingThread"); - setDaemon(true); - this.clients = clients; - - cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(0)).getBaseURL(), httpClient, 8, 2); - cusc.setConnectionTimeout(10000); - cusc.setSoTimeout(clientSoTimeout); - } - - @Override - public void run() { - int i = 0; - int numDeletes = 0; - int numAdds = 0; - - while (true && !stop) { - String id = this.id + "-" + i; - ++i; - - if (doDeletes && random().nextBoolean() && deletes.size() > 0) { - String delete = deletes.remove(0); - try { - numDeletes++; - cusc.deleteById(delete); - } catch (Exception e) { - changeUrlOnError(e); - fails.incrementAndGet(); - } - } - - try { - numAdds++; - if (numAdds > (TEST_NIGHTLY ? 4002 : 197)) - continue; - SolrInputDocument doc = getDoc( - "id", - id, - i1, - 50, - t1, - "Saxon heptarchies that used to rip around so in old times and raise Cain. My, you ought to seen old Henry the Eight when he was in bloom. He WAS a blossom. He used to marry a new wife every day, and chop off her head next morning. And he would do it just as indifferent as if "); - cusc.add(doc); - } catch (Exception e) { - changeUrlOnError(e); - fails.incrementAndGet(); - } - - if (doDeletes && random().nextBoolean()) { - deletes.add(id); - } - - } - - log.info("FT added docs:" + numAdds + " with " + fails + " fails" + " deletes:" + numDeletes); - } - - private void changeUrlOnError(Exception e) { - if (e instanceof ConnectException) { - clientIndex++; - if (clientIndex > clients.size() - 1) { - clientIndex = 0; - } - cusc.shutdownNow(); - cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(clientIndex)).getBaseURL(), - httpClient, 30, 3); - } - } - - @Override - public void safeStop() { - stop = true; - cusc.blockUntilFinished(); - cusc.shutdownNow(); - IOUtils.closeQuietly(httpClient); - } - - @Override - public int getFailCount() { - return fails.get(); - } - - @Override - public Set getAddFails() { - throw new UnsupportedOperationException(); - } - - @Override - public Set getDeleteFails() { - throw new UnsupportedOperationException(); - } - - }; - - // skip the randoms - they can deadlock... @Override protected void indexr(Object... fields) throws Exception { @@ -401,13 +282,4 @@ protected void indexr(Object... fields) throws Exception { indexDoc(doc); } - static class ErrorLoggingConcurrentUpdateSolrClient extends ConcurrentUpdateSolrClient { - public ErrorLoggingConcurrentUpdateSolrClient(String serverUrl, HttpClient httpClient, int queueSize, int threadCount) { - super(serverUrl, httpClient, queueSize, threadCount, null, false); - } - @Override - public void handleError(Throwable ex) { - log.warn("cusc error", ex); - } - } } diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index 87bbb97342df..f75294bfabb9 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -16,53 +16,34 @@ */ package org.apache.solr.cloud; -import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.net.ConnectException; import java.util.ArrayList; import java.util.EnumSet; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import org.apache.http.client.HttpClient; -import org.apache.http.impl.client.CloseableHttpClient; import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker; import org.apache.solr.SolrTestCaseJ4.SuppressSSL; -import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; -import org.apache.solr.client.solrj.impl.HttpClientUtil; -import org.apache.solr.client.solrj.impl.HttpSolrClient; -import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkStateReader; -import org.apache.solr.common.params.ModifiableSolrParams; -import org.apache.solr.common.util.IOUtils; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.handler.ReplicationHandler; import org.apache.solr.util.TimeOut; -import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; - @Slow @SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") -@ThreadLeakLingering(linger = 60000) +//@ThreadLeakLingering(linger = 60000) @SuppressObjectReleaseTracker(bugUrl="Testing purposes") public class ChaosMonkeyNothingIsSafeWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { private static final int FAIL_TOLERANCE = 100; @@ -188,8 +169,8 @@ public void test() throws Exception { // it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer boolean runFullThrottle = random().nextBoolean(); if (runFullThrottle) { - FullThrottleStoppableIndexingThread ftIndexThread = new FullThrottleStoppableIndexingThread( - clients, "ft1", true); + FullThrottleStoppableIndexingThread ftIndexThread = + new FullThrottleStoppableIndexingThread(controlClient, cloudClient, clients, "ft1", true, this.clientSoTimeout); threads.add(ftIndexThread); ftIndexThread.start(); } @@ -304,94 +285,6 @@ public void test() throws Exception { } } - private void logReplicaTypesReplicationInfo(String collectionName, ZkStateReader zkStateReader) throws KeeperException, InterruptedException, IOException { - log.info("## Extra Replica.Type information of the cluster"); - zkStateReader.forceUpdateCollection(collectionName); - DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); - for(Slice s:collection.getSlices()) { - Replica leader = s.getLeader(); - for (Replica r:s.getReplicas()) { - if (!zkStateReader.getClusterState().liveNodesContain(r.getNodeName())) { - log.info("Replica {} not in liveNodes", r.getName()); - continue; - } - if (r.equals(leader)) { - log.info("Replica {} is leader", r.getName()); - } - logReplicationDetails(r); - } - } - } - - private void waitForReplicationFromReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException { - zkStateReader.forceUpdateCollection(collectionName); - DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); - for(Slice s:collection.getSlices()) { - Replica leader = s.getLeader(); - long leaderIndexVersion = -1; - while (leaderIndexVersion == -1 && !timeout.hasTimedOut()) { - leaderIndexVersion = getIndexVersion(leader); - if (leaderIndexVersion < 0) { - Thread.sleep(1000); - } - } - for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE,Replica.Type.APPEND))) { - if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { - continue; - } - while (true) { - long replicaIndexVersion = getIndexVersion(passiveReplica); - if (leaderIndexVersion == replicaIndexVersion) { - log.debug("Leader replica's version ({}) in sync with replica({}): {} == {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); - break; - } else { - if (timeout.hasTimedOut()) { - logReplicaTypesReplicationInfo(collectionName, zkStateReader); - fail(String.format(Locale.ROOT, "Timed out waiting for replica %s (%d) to replicate from leader %s (%d)", passiveReplica.getName(), replicaIndexVersion, leader.getName(), leaderIndexVersion)); - } - if (leaderIndexVersion > replicaIndexVersion) { - log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); - } else { - log.debug("Leader replica's version ({}) is lower than passive replica({}): {} < {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); - } - Thread.sleep(1000); - } - } - } - } - } - - private long getIndexVersion(Replica replica) throws IOException { - try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { - ModifiableSolrParams params = new ModifiableSolrParams(); - params.set("qt", "/replication"); - params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_SHOW_COMMITS); - try { - QueryResponse response = client.query(params); - @SuppressWarnings("unchecked") - List> commits = (List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS); - return (Long)commits.get(commits.size() - 1).get("indexVersion"); - } catch (SolrServerException e) { - log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); - return -1; - } - } - } - - private void logReplicationDetails(Replica replica) throws IOException { - try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { - ModifiableSolrParams params = new ModifiableSolrParams(); - params.set("qt", "/replication"); - params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_DETAILS); - try { - QueryResponse response = client.query(params); - log.info("{}: {}", replica.getName(), response.getResponse()); - } catch (SolrServerException e) { - log.warn("Unable to ger replication details for replica {}", replica.getName(), e); - } - } - } - private Set getAddFails(List threads) { Set addFails = new HashSet(); for (StoppableIndexingThread thread : threads) { @@ -408,111 +301,6 @@ private Set getDeleteFails(List threads) { return deleteFails; } - class FullThrottleStoppableIndexingThread extends StoppableIndexingThread { - private CloseableHttpClient httpClient = HttpClientUtil.createClient(null); - private volatile boolean stop = false; - int clientIndex = 0; - private ConcurrentUpdateSolrClient cusc; - private List clients; - private AtomicInteger fails = new AtomicInteger(); - - public FullThrottleStoppableIndexingThread(List clients, - String id, boolean doDeletes) { - super(controlClient, cloudClient, id, doDeletes); - setName("FullThrottleStopableIndexingThread"); - setDaemon(true); - this.clients = clients; - - cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(0)).getBaseURL(), httpClient, 8, 2); - cusc.setConnectionTimeout(10000); - cusc.setSoTimeout(clientSoTimeout); - } - - @Override - public void run() { - int i = 0; - int numDeletes = 0; - int numAdds = 0; - - while (true && !stop) { - String id = this.id + "-" + i; - ++i; - - if (doDeletes && random().nextBoolean() && deletes.size() > 0) { - String delete = deletes.remove(0); - try { - numDeletes++; - cusc.deleteById(delete); - } catch (Exception e) { - changeUrlOnError(e); - fails.incrementAndGet(); - } - } - - try { - numAdds++; - if (numAdds > (TEST_NIGHTLY ? 4002 : 197)) - continue; - SolrInputDocument doc = getDoc( - "id", - id, - i1, - 50, - t1, - "Saxon heptarchies that used to rip around so in old times and raise Cain. My, you ought to seen old Henry the Eight when he was in bloom. He WAS a blossom. He used to marry a new wife every day, and chop off her head next morning. And he would do it just as indifferent as if "); - cusc.add(doc); - } catch (Exception e) { - changeUrlOnError(e); - fails.incrementAndGet(); - } - - if (doDeletes && random().nextBoolean()) { - deletes.add(id); - } - - } - - log.info("FT added docs:" + numAdds + " with " + fails + " fails" + " deletes:" + numDeletes); - } - - private void changeUrlOnError(Exception e) { - if (e instanceof ConnectException) { - clientIndex++; - if (clientIndex > clients.size() - 1) { - clientIndex = 0; - } - cusc.shutdownNow(); - cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(clientIndex)).getBaseURL(), - httpClient, 30, 3); - } - } - - @Override - public void safeStop() { - stop = true; - cusc.blockUntilFinished(); - cusc.shutdownNow(); - IOUtils.closeQuietly(httpClient); - } - - @Override - public int getFailCount() { - return fails.get(); - } - - @Override - public Set getAddFails() { - throw new UnsupportedOperationException(); - } - - @Override - public Set getDeleteFails() { - throw new UnsupportedOperationException(); - } - - }; - - // skip the randoms - they can deadlock... @Override protected void indexr(Object... fields) throws Exception { @@ -520,13 +308,4 @@ protected void indexr(Object... fields) throws Exception { indexDoc(doc); } - static class ErrorLoggingConcurrentUpdateSolrClient extends ConcurrentUpdateSolrClient { - public ErrorLoggingConcurrentUpdateSolrClient(String serverUrl, HttpClient httpClient, int queueSize, int threadCount) { - super(serverUrl, httpClient, queueSize, threadCount, null, false); - } - @Override - public void handleError(Throwable ex) { - log.warn("cusc error", ex); - } - } } diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java index df89adb63fe6..582da646580a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -16,12 +16,10 @@ */ package org.apache.solr.cloud; -import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.EnumSet; import java.util.List; -import java.util.Locale; import java.util.concurrent.TimeUnit; import org.apache.lucene.util.LuceneTestCase.Slow; @@ -29,18 +27,11 @@ import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.impl.HttpSolrClient; -import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; -import org.apache.solr.common.cloud.ZkStateReader; -import org.apache.solr.common.params.ModifiableSolrParams; -import org.apache.solr.common.util.NamedList; -import org.apache.solr.handler.ReplicationHandler; import org.apache.solr.util.TimeOut; -import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -201,7 +192,7 @@ public void test() throws Exception { log.info("control docs:" + controlClient.query(new SolrQuery("*:*")).getResults().getNumFound() + "\n\n"); - waitForReplicationFromPassiveReplicas(DEFAULT_COLLECTION, cloudClient.getZkStateReader(), new TimeOut(30, TimeUnit.SECONDS)); + waitForReplicationFromReplicas(DEFAULT_COLLECTION, cloudClient.getZkStateReader(), new TimeOut(30, TimeUnit.SECONDS)); checkShardConsistency(batchSize == 1, true); @@ -239,55 +230,6 @@ private void tryDelete() throws Exception { } } - private void waitForReplicationFromPassiveReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException { - zkStateReader.forceUpdateCollection(collectionName); - DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); - for(Slice s:collection.getSlices()) { - Replica leader = s.getLeader(); - long leaderIndexVersion = -1; - while (leaderIndexVersion == -1 && !timeout.hasTimedOut()) { - leaderIndexVersion = getIndexVersion(leader); - Thread.sleep(1000); - } - for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE))) { - if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { - continue; - } - while (true) { - long replicaIndexVersion = getIndexVersion(passiveReplica); - if (leaderIndexVersion > replicaIndexVersion) { - if (timeout.hasTimedOut()) { - fail(String.format(Locale.ROOT, "Timed out waiting for replica %s to replicate from leader %s", passiveReplica.getName(), leader.getName())); - } - log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); - Thread.sleep(1000); - } else { - break; - } - } - } - } - } - - @SuppressWarnings("unchecked") - private long getIndexVersion(Replica replica) throws IOException { - try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { - ModifiableSolrParams params = new ModifiableSolrParams(); - params.set("qt", "/replication"); - params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_SHOW_COMMITS); - try { - QueryResponse response = client.query(params); - List> commits = (List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS); - System.out.println(commits); //TODO: How to get the correct indexVersion from slave? - return (Long)commits.get(0).get("indexVersion"); - } catch (SolrServerException e) { - log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); - return -1; - } - } - } - - // skip the randoms - they can deadlock... @Override protected void indexr(Object... fields) throws Exception { diff --git a/solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java b/solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java new file mode 100644 index 000000000000..e12a75e5cce6 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.lang.invoke.MethodHandles; +import java.net.ConnectException; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.http.client.HttpClient; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; +import org.apache.solr.client.solrj.impl.HttpClientUtil; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.util.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class FullThrottleStoppableIndexingThread extends StoppableIndexingThread { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + /** + * + */ + private CloseableHttpClient httpClient = HttpClientUtil.createClient(null); + private volatile boolean stop = false; + int clientIndex = 0; + private ConcurrentUpdateSolrClient cusc; + private List clients; + private AtomicInteger fails = new AtomicInteger(); + + public FullThrottleStoppableIndexingThread(SolrClient controlClient, SolrClient cloudClient, List clients, + String id, boolean doDeletes, int clientSoTimeout) { + super(controlClient, cloudClient, id, doDeletes); + setName("FullThrottleStopableIndexingThread"); + setDaemon(true); + this.clients = clients; + + cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(0)).getBaseURL(), httpClient, 8, 2); + cusc.setConnectionTimeout(10000); + cusc.setSoTimeout(clientSoTimeout); + } + + @Override + public void run() { + int i = 0; + int numDeletes = 0; + int numAdds = 0; + + while (true && !stop) { + String id = this.id + "-" + i; + ++i; + + if (doDeletes && ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.random().nextBoolean() && deletes.size() > 0) { + String delete = deletes.remove(0); + try { + numDeletes++; + cusc.deleteById(delete); + } catch (Exception e) { + changeUrlOnError(e); + fails.incrementAndGet(); + } + } + + try { + numAdds++; + if (numAdds > (ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.TEST_NIGHTLY ? 4002 : 197)) + continue; + SolrInputDocument doc = AbstractFullDistribZkTestBase.getDoc( + "id", + id, + i1, + 50, + t1, + "Saxon heptarchies that used to rip around so in old times and raise Cain. My, you ought to seen old Henry the Eight when he was in bloom. He WAS a blossom. He used to marry a new wife every day, and chop off her head next morning. And he would do it just as indifferent as if "); + cusc.add(doc); + } catch (Exception e) { + changeUrlOnError(e); + fails.incrementAndGet(); + } + + if (doDeletes && ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.random().nextBoolean()) { + deletes.add(id); + } + + } + + ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.log.info("FT added docs:" + numAdds + " with " + fails + " fails" + " deletes:" + numDeletes); + } + + private void changeUrlOnError(Exception e) { + if (e instanceof ConnectException) { + clientIndex++; + if (clientIndex > clients.size() - 1) { + clientIndex = 0; + } + cusc.shutdownNow(); + cusc = new ErrorLoggingConcurrentUpdateSolrClient(((HttpSolrClient) clients.get(clientIndex)).getBaseURL(), + httpClient, 30, 3); + } + } + + @Override + public void safeStop() { + stop = true; + cusc.blockUntilFinished(); + cusc.shutdownNow(); + IOUtils.closeQuietly(httpClient); + } + + @Override + public int getFailCount() { + return fails.get(); + } + + @Override + public Set getAddFails() { + throw new UnsupportedOperationException(); + } + + @Override + public Set getDeleteFails() { + throw new UnsupportedOperationException(); + } + + static class ErrorLoggingConcurrentUpdateSolrClient extends ConcurrentUpdateSolrClient { + @SuppressWarnings("deprecation") + public ErrorLoggingConcurrentUpdateSolrClient(String serverUrl, HttpClient httpClient, int queueSize, int threadCount) { + super(serverUrl, httpClient, queueSize, threadCount, null, false); + } + @Override + public void handleError(Throwable ex) { + log.warn("cusc error", ex); + } + } + +} \ No newline at end of file diff --git a/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java b/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java index 9213fee1625c..ed778a6be6a4 100644 --- a/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java +++ b/solr/test-framework/src/java/org/apache/solr/BaseDistributedSearchTestCase.java @@ -451,7 +451,7 @@ protected String buildUrl(int port) { return buildUrl(port, context); } - protected void addFields(SolrInputDocument doc, Object... fields) { + protected static void addFields(SolrInputDocument doc, Object... fields) { for (int i = 0; i < fields.length; i += 2) { doc.addField((String) (fields[i]), fields[i + 1]); } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java index acd425b2026f..f48ac3aa2bff 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java @@ -25,9 +25,11 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; @@ -74,11 +76,13 @@ import org.apache.solr.core.CoreContainer; import org.apache.solr.core.Diagnostics; import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.ReplicationHandler; import org.apache.solr.update.DirectUpdateHandler2; import org.apache.solr.update.SolrCmdDistributor; import org.apache.solr.util.RTimer; import org.apache.solr.util.TimeOut; import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.noggit.CharArr; @@ -1717,7 +1721,7 @@ protected String getBaseUrl(HttpSolrClient client) { - DEFAULT_COLLECTION.length() - 1); } - protected SolrInputDocument getDoc(Object... fields) throws Exception { + public static SolrInputDocument getDoc(Object... fields) throws Exception { SolrInputDocument doc = new SolrInputDocument(); addFields(doc, fields); return doc; @@ -2024,6 +2028,97 @@ protected boolean reloadCollection(Replica replica, String testCollectionName) t } return reloadedOk; } + + + protected void logReplicaTypesReplicationInfo(String collectionName, ZkStateReader zkStateReader) throws KeeperException, InterruptedException, IOException { + log.info("## Collecting extra Replica.Type information of the cluster"); + StringBuilder builder = new StringBuilder(); + zkStateReader.forceUpdateCollection(collectionName); + DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); + for(Slice s:collection.getSlices()) { + Replica leader = s.getLeader(); + for (Replica r:s.getReplicas()) { + if (!r.isActive(zkStateReader.getClusterState().getLiveNodes())) { + builder.append(String.format(Locale.ROOT, "Replica %s not in liveNodes or is not active%s", r.getName(), System.lineSeparator())); + continue; + } + if (r.equals(leader)) { + builder.append(String.format(Locale.ROOT, "Replica %s is leader%s", r.getName(), System.lineSeparator())); + } + logReplicationDetails(r, builder); + } + } + log.info("Summary of the cluster: " + builder.toString()); + } + + protected void waitForReplicationFromReplicas(String collectionName, ZkStateReader zkStateReader, TimeOut timeout) throws KeeperException, InterruptedException, IOException { + zkStateReader.forceUpdateCollection(collectionName); + DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); + for(Slice s:collection.getSlices()) { + Replica leader = s.getLeader(); + long leaderIndexVersion = -1; + while (leaderIndexVersion == -1 && !timeout.hasTimedOut()) { + leaderIndexVersion = getIndexVersion(leader); + if (leaderIndexVersion < 0) { + Thread.sleep(1000); + } + } + for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE,Replica.Type.APPEND))) { + if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { + continue; + } + while (true) { + long replicaIndexVersion = getIndexVersion(passiveReplica); + if (leaderIndexVersion == replicaIndexVersion) { + log.debug("Leader replica's version ({}) in sync with replica({}): {} == {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); + break; + } else { + if (timeout.hasTimedOut()) { + logReplicaTypesReplicationInfo(collectionName, zkStateReader); + fail(String.format(Locale.ROOT, "Timed out waiting for replica %s (%d) to replicate from leader %s (%d)", passiveReplica.getName(), replicaIndexVersion, leader.getName(), leaderIndexVersion)); + } + if (leaderIndexVersion > replicaIndexVersion) { + log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); + } else { + log.debug("Leader replica's version ({}) is lower than passive replica({}): {} < {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); + } + Thread.sleep(1000); + } + } + } + } + } + + protected long getIndexVersion(Replica replica) throws IOException { + try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set("qt", "/replication"); + params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_SHOW_COMMITS); + try { + QueryResponse response = client.query(params); + @SuppressWarnings("unchecked") + List> commits = (List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS); + return (Long)commits.get(commits.size() - 1).get("indexVersion"); + } catch (SolrServerException e) { + log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); + return -1; + } + } + } + + protected void logReplicationDetails(Replica replica, StringBuilder builder) throws IOException { + try (HttpSolrClient client = new HttpSolrClient.Builder(replica.getCoreUrl()).build()) { + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set("qt", "/replication"); + params.set(ReplicationHandler.COMMAND, ReplicationHandler.CMD_DETAILS); + try { + QueryResponse response = client.query(params); + builder.append(String.format(Locale.ROOT, "%s: %s%s", replica.getName(), response.getResponse(), System.lineSeparator())); + } catch (SolrServerException e) { + log.warn("Unable to ger replication details for replica {}", replica.getName(), e); + } + } + } static RequestStatusState getRequestStateAfterCompletion(String requestId, int waitForSeconds, SolrClient client) throws IOException, SolrServerException { diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index b9080c5a1dee..689f0ee29f19 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -688,6 +688,13 @@ public static boolean start(JettySolrRunner jetty) throws Exception { return true; } + /** + * You can call this method to wait while the ChaosMonkey is running, it waits approximately the specified time, and periodically + * logs the status of the collection + * @param runLength The time in ms to wait + * @param collectionName The main collection being used for the ChaosMonkey + * @param zkStateReader current state reader + */ public static void wait(long runLength, String collectionName, ZkStateReader zkStateReader) throws InterruptedException { TimeOut t = new TimeOut(runLength, TimeUnit.MILLISECONDS); while (!t.hasTimedOut()) { From e9f3b3a51bd0626e53517fcd8518225cb1f688b4 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 3 May 2017 11:17:41 -0700 Subject: [PATCH 12/41] Added logging --- .../src/java/org/apache/solr/cloud/ReplicateFromLeader.java | 1 + solr/core/src/java/org/apache/solr/cloud/ZkController.java | 4 ++-- .../src/java/org/apache/solr/common/cloud/ZkStateReader.java | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index b0bca44a7ddf..44410186dbed 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -70,6 +70,7 @@ public void startReplication(boolean switchTransactionLog) throws InterruptedExc } else if (uinfo.autoSoftCommmitMaxTime != -1) { pollIntervalStr = toPollIntervalStr(uinfo.autoSoftCommmitMaxTime/2); } + LOG.info("Will start replication from leader with poll interval: {}", pollIntervalStr ); NamedList slaveConfig = new NamedList(); slaveConfig.add("fetchFromLeader", true); diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index e4e98da293fd..509a74db57ba 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -969,7 +969,7 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov } public void startReplicationFromLeader(String coreName, boolean switchTransactionLog) throws InterruptedException { - log.info("{} starting replication from leader", coreName); + log.info("{} starting background replication from leader", coreName); ReplicateFromLeader replicateFromLeader = new ReplicateFromLeader(cc, coreName); if (replicateFromLeaders.putIfAbsent(coreName, replicateFromLeader) == null) { replicateFromLeader.startReplication(switchTransactionLog); @@ -979,7 +979,7 @@ public void startReplicationFromLeader(String coreName, boolean switchTransactio } public void stopReplicationFromLeader(String coreName) { - log.info("{} stopping replication from leader", coreName); + log.info("{} stopping background replication from leader", coreName); ReplicateFromLeader replicateFromLeader = replicateFromLeaders.remove(coreName); if (replicateFromLeader != null) { replicateFromLeader.stopReplication(); diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index 625230fec81d..c0a6caf6c8f2 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -791,7 +791,7 @@ public List getReplicaProps(String collection, String shardId, public List getReplicaProps(String collection, String shardId, String thisCoreNodeName, Replica.State mustMatchStateFilter, Replica.State mustNotMatchStateFilter) { - //nocommit: We don't need all these getReplicaProps overloading. Also, it's odd that the default is to return replicas of type APPEND and REALTIME only + //nocommit: We don't need all these getReplicaProps method overloading. Also, it's odd that the default is to return replicas of type APPEND and REALTIME only return getReplicaProps(collection, shardId, thisCoreNodeName, mustMatchStateFilter, null, EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)); } From 688f12df4bb2d6588f43a5a67c9752a116b888aa Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 3 May 2017 11:23:31 -0700 Subject: [PATCH 13/41] Minor improvements to ChaosMonkey tests --- ...yNothingIsSafeWithPassiveReplicasTest.java | 22 ++++++++++++++++--- .../FullThrottleStoppableIndexingThread.java | 12 +++++----- .../java/org/apache/solr/SolrTestCaseJ4.java | 17 ++++++++------ .../cloud/AbstractFullDistribZkTestBase.java | 16 +++++++++----- .../org/apache/solr/cloud/ChaosMonkey.java | 2 +- 5 files changed, 48 insertions(+), 21 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index f75294bfabb9..db4539265c2b 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -28,12 +28,14 @@ import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker; import org.apache.solr.SolrTestCaseJ4.SuppressSSL; import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.core.SolrCore; import org.apache.solr.util.TimeOut; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -97,8 +99,8 @@ public void distribSetUp() throws Exception { public ChaosMonkeyNothingIsSafeWithPassiveReplicasTest() { super(); - numPassiveReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; - numRealtimeOrAppendReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; + numPassiveReplicas = random().nextInt(TEST_NIGHTLY ? 2 : 1) + 1; + numRealtimeOrAppendReplicas = random().nextInt(TEST_NIGHTLY ? 4 : 3) + 1; sliceCount = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.slicecount", "-1")); if (sliceCount == -1) { sliceCount = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1; @@ -211,7 +213,7 @@ public void test() throws Exception { // try and wait for any replications and what not to finish... - Thread.sleep(2000); + ChaosMonkey.wait(2000, DEFAULT_COLLECTION, zkStateReader); // wait until there are no recoveries... waitForThingsToLevelOut(Integer.MAX_VALUE);//Math.round((runLength / 1000.0f / 3.0f))); @@ -239,6 +241,7 @@ public void test() throws Exception { } waitForReplicationFromReplicas(DEFAULT_COLLECTION, zkStateReader, new TimeOut(30, TimeUnit.SECONDS)); + waitForAllWarmingSearchers(); Set addFails = getAddFails(indexTreads); Set deleteFails = getDeleteFails(indexTreads); @@ -285,10 +288,22 @@ public void test() throws Exception { } } + private void waitForAllWarmingSearchers() throws InterruptedException { + for (JettySolrRunner jetty:jettys) { + if (!jetty.isRunning()) { + continue; + } + for (SolrCore core:jetty.getCoreContainer().getCores()) { + waitForWarming(core); + } + } + } + private Set getAddFails(List threads) { Set addFails = new HashSet(); for (StoppableIndexingThread thread : threads) { addFails.addAll(thread.getAddFails()); +// addFails.addAll(thread.getAddFailsMinRf()); } return addFails; } @@ -297,6 +312,7 @@ private Set getDeleteFails(List threads) { Set deleteFails = new HashSet(); for (StoppableIndexingThread thread : threads) { deleteFails.addAll(thread.getDeleteFails()); +// deleteFails.addAll(thread.getDeleteFailsMinRf()); } return deleteFails; } diff --git a/solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java b/solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java index e12a75e5cce6..b9e177a0d9fc 100644 --- a/solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java +++ b/solr/core/src/test/org/apache/solr/cloud/FullThrottleStoppableIndexingThread.java @@ -24,7 +24,9 @@ import org.apache.http.client.HttpClient; import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.lucene.util.LuceneTestCase; import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; import org.apache.solr.client.solrj.impl.HttpClientUtil; import org.apache.solr.client.solrj.impl.HttpSolrClient; @@ -46,7 +48,7 @@ class FullThrottleStoppableIndexingThread extends StoppableIndexingThread { private List clients; private AtomicInteger fails = new AtomicInteger(); - public FullThrottleStoppableIndexingThread(SolrClient controlClient, SolrClient cloudClient, List clients, + public FullThrottleStoppableIndexingThread(SolrClient controlClient, CloudSolrClient cloudClient, List clients, String id, boolean doDeletes, int clientSoTimeout) { super(controlClient, cloudClient, id, doDeletes); setName("FullThrottleStopableIndexingThread"); @@ -68,7 +70,7 @@ public void run() { String id = this.id + "-" + i; ++i; - if (doDeletes && ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.random().nextBoolean() && deletes.size() > 0) { + if (doDeletes && LuceneTestCase.random().nextBoolean() && deletes.size() > 0) { String delete = deletes.remove(0); try { numDeletes++; @@ -81,7 +83,7 @@ public void run() { try { numAdds++; - if (numAdds > (ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.TEST_NIGHTLY ? 4002 : 197)) + if (numAdds > (LuceneTestCase.TEST_NIGHTLY ? 4002 : 197)) continue; SolrInputDocument doc = AbstractFullDistribZkTestBase.getDoc( "id", @@ -96,13 +98,13 @@ public void run() { fails.incrementAndGet(); } - if (doDeletes && ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.random().nextBoolean()) { + if (doDeletes && LuceneTestCase.random().nextBoolean()) { deletes.add(id); } } - ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.log.info("FT added docs:" + numAdds + " with " + fails + " fails" + " deletes:" + numDeletes); + log.info("FT added docs:" + numAdds + " with " + fails + " fails" + " deletes:" + numDeletes); } private void changeUrlOnError(Exception e) { diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java index c94f24c8a49c..7e443bb87534 100644 --- a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java +++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java @@ -2423,24 +2423,27 @@ public static String randomXmlUsableUnicodeString() { } return result; } - - protected void waitForWarming() throws InterruptedException { - RefCounted registeredSearcher = h.getCore().getRegisteredSearcher(); - RefCounted newestSearcher = h.getCore().getNewestSearcher(false); - ; + + protected static void waitForWarming(SolrCore core) throws InterruptedException { + RefCounted registeredSearcher = core.getRegisteredSearcher(); + RefCounted newestSearcher = core.getNewestSearcher(false); while (registeredSearcher == null || registeredSearcher.get() != newestSearcher.get()) { if (registeredSearcher != null) { registeredSearcher.decref(); } newestSearcher.decref(); Thread.sleep(50); - registeredSearcher = h.getCore().getRegisteredSearcher(); - newestSearcher = h.getCore().getNewestSearcher(false); + registeredSearcher = core.getRegisteredSearcher(); + newestSearcher = core.getNewestSearcher(false); } registeredSearcher.decref(); newestSearcher.decref(); } + protected void waitForWarming() throws InterruptedException { + waitForWarming(h.getCore()); + } + @BeforeClass public static void chooseMPForMP() throws Exception { if (random().nextBoolean()) { diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java index f48ac3aa2bff..4d362866ef6f 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java @@ -2032,6 +2032,7 @@ protected boolean reloadCollection(Replica replica, String testCollectionName) t protected void logReplicaTypesReplicationInfo(String collectionName, ZkStateReader zkStateReader) throws KeeperException, InterruptedException, IOException { log.info("## Collecting extra Replica.Type information of the cluster"); + zkStateReader.updateLiveNodes(); StringBuilder builder = new StringBuilder(); zkStateReader.forceUpdateCollection(collectionName); DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); @@ -2057,18 +2058,22 @@ protected void waitForReplicationFromReplicas(String collectionName, ZkStateRead for(Slice s:collection.getSlices()) { Replica leader = s.getLeader(); long leaderIndexVersion = -1; - while (leaderIndexVersion == -1 && !timeout.hasTimedOut()) { + while (!timeout.hasTimedOut()) { leaderIndexVersion = getIndexVersion(leader); - if (leaderIndexVersion < 0) { - Thread.sleep(1000); + if (leaderIndexVersion >= 0) { + break; } + Thread.sleep(1000); + } + if (timeout.hasTimedOut()) { + fail("Unable to get leader indexVersion"); } for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE,Replica.Type.APPEND))) { if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { continue; } while (true) { - long replicaIndexVersion = getIndexVersion(passiveReplica); + long replicaIndexVersion = getIndexVersion(passiveReplica); if (leaderIndexVersion == replicaIndexVersion) { log.debug("Leader replica's version ({}) in sync with replica({}): {} == {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); break; @@ -2098,7 +2103,8 @@ protected long getIndexVersion(Replica replica) throws IOException { QueryResponse response = client.query(params); @SuppressWarnings("unchecked") List> commits = (List>)response.getResponse().get(ReplicationHandler.CMD_SHOW_COMMITS); - return (Long)commits.get(commits.size() - 1).get("indexVersion"); + Collections.max(commits, (a,b)->((Long)a.get("indexVersion")).compareTo((Long)b.get("indexVersion"))); + return (long) Collections.max(commits, (a,b)->((Long)a.get("indexVersion")).compareTo((Long)b.get("indexVersion"))).get("indexVersion"); } catch (SolrServerException e) { log.warn("Exception getting version from {}, will return an invalid version to retry.", replica.getName(), e); return -1; diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index 689f0ee29f19..05a42eea39c3 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -698,7 +698,7 @@ public static boolean start(JettySolrRunner jetty) throws Exception { public static void wait(long runLength, String collectionName, ZkStateReader zkStateReader) throws InterruptedException { TimeOut t = new TimeOut(runLength, TimeUnit.MILLISECONDS); while (!t.hasTimedOut()) { - Thread.sleep(Math.min(1000, runLength)); + Thread.sleep(Math.min(1000, t.timeLeft(TimeUnit.MILLISECONDS))); logCollectionStateSummary(collectionName, zkStateReader); } } From 484263360310f2aed2e24df27729e2b4ecf13811 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 3 May 2017 13:26:14 -0700 Subject: [PATCH 14/41] Added explicit commits to ChaosMonkey tests --- ...yNothingIsSafeWithPassiveReplicasTest.java | 10 ++- ...nkeySafeLeaderWithPassiveReplicasTest.java | 20 ++++-- .../solr/cloud/StoppableCommitThread.java | 69 +++++++++++++++++++ .../solr/cloud/StoppableIndexingThread.java | 2 +- .../solr/cloud/StoppableSearchThread.java | 2 +- 5 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 solr/test-framework/src/java/org/apache/solr/cloud/StoppableCommitThread.java diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index db4539265c2b..71c2924ef4e9 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -36,6 +36,7 @@ import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.core.SolrCore; +import org.apache.solr.util.TestInjection; import org.apache.solr.util.TimeOut; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -43,9 +44,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; + @Slow @SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") -//@ThreadLeakLingering(linger = 60000) +@ThreadLeakLingering(linger = 60000) @SuppressObjectReleaseTracker(bugUrl="Testing purposes") public class ChaosMonkeyNothingIsSafeWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { private static final int FAIL_TOLERANCE = 100; @@ -73,6 +76,7 @@ public static void beforeSuperClass() { @AfterClass public static void afterSuperClass() { System.clearProperty("solr.autoCommit.maxTime"); + TestInjection.waitForReplicasInSync = null; clearErrorHook(); } @@ -167,6 +171,10 @@ public void test() throws Exception { searchThread.start(); } + StoppableCommitThread commitThread = new StoppableCommitThread(cloudClient, 1000, false); + threads.add(commitThread); + commitThread.start(); + // TODO: we only do this sometimes so that we can sometimes compare against control, // it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer boolean runFullThrottle = random().nextBoolean(); diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java index 582da646580a..556a61b79343 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -31,6 +31,7 @@ import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; +import org.apache.solr.util.TestInjection; import org.apache.solr.util.TimeOut; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -64,6 +65,7 @@ protected boolean useAppendReplicas() { public static void beforeSuperClass() { schemaString = "schema15.xml"; // we need a string id System.setProperty("solr.autoCommit.maxTime", "15000"); + TestInjection.waitForReplicasInSync = null; setErrorHook(); } @@ -122,7 +124,7 @@ public void test() throws Exception { tryDelete(); - List threads = new ArrayList<>(); + List threads = new ArrayList<>(); int threadCount = 2; int batchSize = 1; if (random().nextBoolean()) { @@ -143,6 +145,10 @@ public void test() throws Exception { indexThread.start(); } + StoppableCommitThread commitThread = new StoppableCommitThread(cloudClient, 1000, false); + threads.add(commitThread); + commitThread.start(); + chaosMonkey.startTheMonkey(false, 500); try { long runLength; @@ -164,17 +170,19 @@ public void test() throws Exception { chaosMonkey.stopTheMonkey(); } - for (StoppableIndexingThread indexThread : threads) { - indexThread.safeStop(); + for (StoppableThread thread : threads) { + thread.safeStop(); } // wait for stop... - for (StoppableIndexingThread thread : threads) { + for (StoppableThread thread : threads) { thread.join(); } - for (StoppableIndexingThread indexThread : threads) { - assertEquals(0, indexThread.getFailCount()); + for (StoppableThread thread : threads) { + if (thread instanceof StoppableIndexingThread) { + assertEquals(0, ((StoppableIndexingThread)thread).getFailCount()); + } } // try and wait for any replications and what not to finish... diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/StoppableCommitThread.java b/solr/test-framework/src/java/org/apache/solr/cloud/StoppableCommitThread.java new file mode 100644 index 000000000000..f87ebb5a2a95 --- /dev/null +++ b/solr/test-framework/src/java/org/apache/solr/cloud/StoppableCommitThread.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.lang.invoke.MethodHandles; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.cloud.AbstractFullDistribZkTestBase.StoppableThread; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class StoppableCommitThread extends StoppableThread { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final SolrClient cloudClient; + private final long timeBetweenCommitsMs; + private final boolean softCommits; + private volatile boolean stop = false; + private final AtomicInteger numCommits = new AtomicInteger(0); + private final AtomicInteger numFails = new AtomicInteger(0); + + public StoppableCommitThread(SolrClient cloudClient, long timeBetweenCommitsMs, boolean softCommits) { + super("StoppableCommitThread"); + this.cloudClient = cloudClient; + this.timeBetweenCommitsMs = timeBetweenCommitsMs; + this.softCommits = softCommits; + } + + @Override + public void run() { + log.debug("StoppableCommitThread started"); + while (!stop) { + try { + cloudClient.commit(false, false, softCommits); + numCommits.incrementAndGet(); + } catch (Exception e) { + numFails.incrementAndGet(); + } + try { + Thread.sleep(timeBetweenCommitsMs); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + log.debug("StoppableCommitThread finished. Committed {} times. Failed {} times.", numCommits.get(), numFails.get()); + } + + @Override + public void safeStop() { + this.stop = true; + } + +} diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/StoppableIndexingThread.java b/solr/test-framework/src/java/org/apache/solr/cloud/StoppableIndexingThread.java index 7dd88c1a526d..0385d734dcc0 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/StoppableIndexingThread.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/StoppableIndexingThread.java @@ -132,7 +132,7 @@ public void run() { if (docs.size() > 0 && pauseBetweenUpdates) { try { - Thread.currentThread().sleep(AbstractFullDistribZkTestBase.random().nextInt(500) + 50); + Thread.sleep(AbstractFullDistribZkTestBase.random().nextInt(500) + 50); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/StoppableSearchThread.java b/solr/test-framework/src/java/org/apache/solr/cloud/StoppableSearchThread.java index fa916c654dc1..c579f22def3d 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/StoppableSearchThread.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/StoppableSearchThread.java @@ -47,7 +47,7 @@ public void run() { Random random = LuceneTestCase.random(); int numSearches = 0; - while (true && !stop) { + while (!stop) { numSearches++; try { //to come to the aid of their country. From a1421ea2b9375c80d77e35eb519f01010c059ad7 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Thu, 4 May 2017 17:08:39 -0700 Subject: [PATCH 15/41] Don't run ChaosMonkey tests with connection loss yet --- .../src/java/org/apache/solr/cloud/ChaosMonkey.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index 05a42eea39c3..355e6016307f 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -67,8 +67,9 @@ public class ChaosMonkey { private Map> shardToJetty; private static final Boolean MONKEY_ENABLED = Boolean.valueOf(System.getProperty("solr.tests.cloud.cm.enabled", "true")); - private static final String CONN_LOSS = System.getProperty("solr.tests.cloud.cm.connloss"); - private static final String EXP = System.getProperty("solr.tests.cloud.cm.exp"); + // NOTE: CONN_LOSS and EXP are currently being set to "false" intentionally here. Remove the default value once we know tests pass reliably under those conditions + private static final String CONN_LOSS = System.getProperty("solr.tests.cloud.cm.connloss", "false"); + private static final String EXP = System.getProperty("solr.tests.cloud.cm.exp", "false"); private ZkTestServer zkServer; private ZkStateReader zkStateReader; From 3d49a6ea3cd4a896d6734bccf9a45ff497937761 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Thu, 4 May 2017 17:10:19 -0700 Subject: [PATCH 16/41] RTG requests are forwarded to REALTIME replicas when distrib=true --- .../solr/handler/RealTimeGetHandler.java | 18 +++++-- .../handler/component/HttpShardHandler.java | 29 +++++++++- .../component/RealTimeGetComponent.java | 29 ++++++---- .../apache/solr/cloud/TestAppendReplica.java | 54 +++++++++++++++---- .../apache/solr/cloud/TestPassiveReplica.java | 50 ++++++++++++++--- .../java/org/apache/solr/SolrTestCaseJ4.java | 12 ++++- 6 files changed, 158 insertions(+), 34 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java b/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java index bce374f4aea4..247b65cd1420 100644 --- a/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java @@ -16,14 +16,17 @@ */ package org.apache.solr.handler; -import org.apache.solr.api.Api; -import org.apache.solr.api.ApiBag; -import org.apache.solr.handler.component.*; - import java.util.ArrayList; import java.util.Collection; import java.util.List; +import org.apache.solr.api.Api; +import org.apache.solr.api.ApiBag; +import org.apache.solr.handler.component.RealTimeGetComponent; +import org.apache.solr.handler.component.SearchHandler; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; + public class RealTimeGetHandler extends SearchHandler { @Override @@ -33,6 +36,13 @@ protected List getDefaultComponents() names.add(RealTimeGetComponent.COMPONENT_NAME); return names; } + + + @Override + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { + req.getContext().put("distribOnlyRealtime", Boolean.TRUE); + super.handleRequestBody(req, rsp); + } //////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java index 8c0a9cb04742..f2d263975582 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java @@ -349,9 +349,12 @@ public void prepDistributed(ResponseBuilder rb) { // and make it a non-distributed request. String ourSlice = cloudDescriptor.getShardId(); String ourCollection = cloudDescriptor.getCollectionName(); + // Some requests may only be fulfilled by replicas of type Replica.Type.REALTIME + boolean onlyRealtimeReplicas = Boolean.TRUE == req.getContext().get("distribOnlyRealtime"); if (rb.slices.length == 1 && rb.slices[0] != null && ( rb.slices[0].equals(ourSlice) || rb.slices[0].equals(ourCollection + "_" + ourSlice) ) // handle the _ format - && cloudDescriptor.getLastPublished() == Replica.State.ACTIVE) { + && cloudDescriptor.getLastPublished() == Replica.State.ACTIVE + && (!onlyRealtimeReplicas || cloudDescriptor.getReplicaType() == Replica.Type.REALTIME)) { boolean shortCircuit = params.getBool("shortCircuit", true); // currently just a debugging parameter to check distrib search on a single node String targetHandler = params.get(ShardParams.SHARDS_QT); @@ -387,14 +390,36 @@ public void prepDistributed(ResponseBuilder rb) { continue; // throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "no such shard: " + sliceName); } + Replica shardLeader = null; final Collection allSliceReplicas = slice.getReplicasMap().values(); final List eligibleSliceReplicas = new ArrayList<>(allSliceReplicas.size()); for (Replica replica : allSliceReplicas) { if (!clusterState.liveNodesContain(replica.getNodeName()) - || replica.getState() != Replica.State.ACTIVE) { + || replica.getState() != Replica.State.ACTIVE + || (onlyRealtimeReplicas && replica.getType() == Replica.Type.PASSIVE)) { continue; } + + if (onlyRealtimeReplicas && replica.getType() == Replica.Type.APPEND) { + if (shardLeader == null) { + try { + shardLeader = zkController.getZkStateReader().getLeaderRetry(cloudDescriptor.getCollectionName(), slice.getName()); + } catch (InterruptedException e) { + throw new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + slice.getName() + " in collection " + + cloudDescriptor.getCollectionName(), e); + } catch (SolrException e) { + if (log.isDebugEnabled()) { + log.debug("Exception finding leader for shard {} in collection {}. Collection State: {}", + slice.getName(), cloudDescriptor.getCollectionName(), zkController.getZkStateReader().getClusterState().getCollectionOrNull(cloudDescriptor.getCollectionName())); + } + throw e; + } + } + if (!replica.getName().equals(shardLeader.getName())) { + continue; + } + } eligibleSliceReplicas.add(replica); } diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java index d785868db320..18e202ecfcce 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java @@ -16,6 +16,10 @@ */ package org.apache.solr.handler.component; +import static org.apache.solr.common.params.CommonParams.DISTRIB; +import static org.apache.solr.common.params.CommonParams.ID; +import static org.apache.solr.common.params.CommonParams.VERSION_FIELD; + import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.ArrayList; @@ -24,6 +28,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; @@ -68,9 +73,9 @@ import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocList; -import org.apache.solr.search.SolrDocumentFetcher; import org.apache.solr.search.QParser; import org.apache.solr.search.ReturnFields; +import org.apache.solr.search.SolrDocumentFetcher; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SolrReturnFields; import org.apache.solr.search.SyntaxError; @@ -82,10 +87,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.solr.common.params.CommonParams.DISTRIB; -import static org.apache.solr.common.params.CommonParams.ID; -import static org.apache.solr.common.params.CommonParams.VERSION_FIELD; - public class RealTimeGetComponent extends SearchComponent { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -105,11 +106,19 @@ public void process(ResponseBuilder rb) throws IOException SolrQueryRequest req = rb.req; SolrQueryResponse rsp = rb.rsp; SolrParams params = req.getParams(); - - if (req.getCore().getCoreDescriptor().getCloudDescriptor() != null - && !req.getCore().getCoreDescriptor().getCloudDescriptor().requiresTransactionLog()) { - //nocommit: forward request to leader - return; + CloudDescriptor cloudDesc = req.getCore().getCoreDescriptor().getCloudDescriptor(); + + if (cloudDesc != null) { + Replica.Type replicaType = cloudDesc.getReplicaType(); + if (replicaType != null) { + if (replicaType == Replica.Type.PASSIVE) { + throw new SolrException(ErrorCode.BAD_REQUEST, + String.format(Locale.ROOT, "%s can't handle realtime get requests. Replicas of type %s do not support these type of requests", + cloudDesc.getCoreNodeName(), + Replica.Type.PASSIVE)); + } + // non-leader APPEND replicas should not respond to distrib /get requests, but internal requests are OK + } } if (!params.getBool(COMPONENT_NAME, true)) { diff --git a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java index a7f30427a793..b56c5a732c99 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java @@ -28,6 +28,7 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import org.apache.http.client.HttpClient; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.solr.client.solrj.SolrClient; @@ -41,6 +42,7 @@ import org.apache.solr.client.solrj.response.CollectionAdminResponse; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.CollectionStatePredicate; @@ -255,16 +257,48 @@ public void testKillLeader() throws Exception { doReplaceLeader(false); } - public void testPassiveReplicaStates() { - // Validate that passive replicas go through the correct states when starting, stopping, reconnecting - } - - public void testPassiveReplicaCantConnectToZooKeeper() { - - } - - public void testRealTimeGet() { - // should be redirected to writers or error + public void testRealTimeGet() throws SolrServerException, IOException, KeeperException, InterruptedException { + // should be redirected to Replica.Type.REALTIME + int numReplicas = random().nextBoolean()?1:2; + int numRealtimeReplicas = random().nextBoolean()?0:2; + CollectionAdminRequest.createCollection(collectionName, "conf", 1, numRealtimeReplicas, numReplicas, 0) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + waitForState("Unexpected replica count", collectionName, activeReplicaCount(numRealtimeReplicas, numReplicas, 0)); + DocCollection docCollection = assertNumberOfReplicas(numRealtimeReplicas, numReplicas, 0, false, true); + HttpClient httpClient = cluster.getSolrClient().getHttpClient(); + int id = 0; + Slice slice = docCollection.getSlice("shard1"); + List ids = new ArrayList<>(slice.getReplicas().size()); + for (Replica rAdd:slice.getReplicas()) { + try (HttpSolrClient client = getHttpSolrClient(rAdd.getCoreUrl(), httpClient)) { + client.add(new SolrInputDocument("id", String.valueOf(id), "foo_s", "bar")); + } + SolrDocument docCloudClient = cluster.getSolrClient().getById(collectionName, String.valueOf(id)); + assertEquals("bar", docCloudClient.getFieldValue("foo_s")); + for (Replica rGet:slice.getReplicas()) { + try (HttpSolrClient client = getHttpSolrClient(rGet.getCoreUrl(), httpClient)) { + SolrDocument doc = client.getById(String.valueOf(id)); + assertEquals("bar", doc.getFieldValue("foo_s")); + } + } + ids.add(String.valueOf(id)); + id++; + } + SolrDocumentList previousAllIdsResult = null; + for (Replica rAdd:slice.getReplicas()) { + try (HttpSolrClient client = getHttpSolrClient(rAdd.getCoreUrl(), httpClient)) { + SolrDocumentList allIdsResult = client.getById(ids); + if (previousAllIdsResult != null) { + assertTrue(compareSolrDocumentList(previousAllIdsResult, allIdsResult)); + } else { + // set the first response here + previousAllIdsResult = allIdsResult; + assertEquals("Unexpected number of documents", ids.size(), allIdsResult.getNumFound()); + } + } + id++; + } } /* diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java index e158a92ee1cb..d087c1966795 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java @@ -28,6 +28,7 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import org.apache.http.client.HttpClient; import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; @@ -36,6 +37,8 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.response.CollectionAdminResponse; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.CollectionStatePredicate; @@ -289,8 +292,47 @@ public void testPassiveReplicaStates() throws Exception { assertEquals("Expecting DOWN->RECOVERING->ACTIVE but saw: " + Arrays.toString(statesSeen.toArray()), Replica.State.ACTIVE, statesSeen.get(0)); } - public void testRealTimeGet() { - // should be redirected to writers + public void testRealTimeGet() throws SolrServerException, IOException, KeeperException, InterruptedException { + // should be redirected to Replica.Type.REALTIME + int numReplicas = random().nextBoolean()?1:2; + CollectionAdminRequest.createCollection(collectionName, "conf", 1, numReplicas, 0, numReplicas) + .setMaxShardsPerNode(100) + .process(cluster.getSolrClient()); + waitForState("Unexpected replica count", collectionName, activeReplicaCount(numReplicas, 0, numReplicas)); + DocCollection docCollection = assertNumberOfReplicas(numReplicas, 0, numReplicas, false, true); + HttpClient httpClient = cluster.getSolrClient().getHttpClient(); + int id = 0; + Slice slice = docCollection.getSlice("shard1"); + List ids = new ArrayList<>(slice.getReplicas().size()); + for (Replica rAdd:slice.getReplicas()) { + try (HttpSolrClient client = getHttpSolrClient(rAdd.getCoreUrl(), httpClient)) { + client.add(new SolrInputDocument("id", String.valueOf(id), "foo_s", "bar")); + } + SolrDocument docCloudClient = cluster.getSolrClient().getById(collectionName, String.valueOf(id)); + assertEquals("bar", docCloudClient.getFieldValue("foo_s")); + for (Replica rGet:slice.getReplicas()) { + try (HttpSolrClient client = getHttpSolrClient(rGet.getCoreUrl(), httpClient)) { + SolrDocument doc = client.getById(String.valueOf(id)); + assertEquals("bar", doc.getFieldValue("foo_s")); + } + } + ids.add(String.valueOf(id)); + id++; + } + SolrDocumentList previousAllIdsResult = null; + for (Replica rAdd:slice.getReplicas()) { + try (HttpSolrClient client = getHttpSolrClient(rAdd.getCoreUrl(), httpClient)) { + SolrDocumentList allIdsResult = client.getById(ids); + if (previousAllIdsResult != null) { + assertTrue(compareSolrDocumentList(previousAllIdsResult, allIdsResult)); + } else { + // set the first response here + previousAllIdsResult = allIdsResult; + assertEquals("Unexpected number of documents", ids.size(), allIdsResult.getNumFound()); + } + } + id++; + } } /* @@ -418,10 +460,6 @@ public void testKillPassiveReplica() throws Exception { waitForNumDocsInAllActiveReplicas(2); } - public void testAddDocsToPassive() { - - } - public void testSearchWhileReplicationHappens() { } diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java index 7e443bb87534..0489ab0a5622 100644 --- a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java +++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java @@ -2121,9 +2121,17 @@ public boolean compareSolrDocumentList(Object expected, Object actual) { SolrDocumentList list1 = (SolrDocumentList) expected; SolrDocumentList list2 = (SolrDocumentList) actual; - if(Float.compare(list1.getMaxScore(), list2.getMaxScore()) != 0 || list1.getNumFound() != list2.getNumFound() || - list1.getStart() != list2.getStart()) { + if (list1.getMaxScore() == null) { + if (list2.getMaxScore() != null) { + return false; + } + } else if (list2.getMaxScore() == null) { return false; + } else { + if (Float.compare(list1.getMaxScore(), list2.getMaxScore()) != 0 || list1.getNumFound() != list2.getNumFound() || + list1.getStart() != list2.getStart()) { + return false; + } } for(int i=0; i Date: Thu, 4 May 2017 17:13:05 -0700 Subject: [PATCH 17/41] Improvements and fixes to existing tests --- .../org/apache/solr/cloud/CloudDescriptor.java | 10 ++++++++-- ...nkeyNothingIsSafeWithPassiveReplicasTest.java | 10 ++++++---- .../solr/cloud/CollectionsAPISolrJTest.java | 4 ++-- .../org/apache/solr/cloud/HttpPartitionTest.java | 4 ++-- .../LeaderInitiatedRecoveryOnCommitTest.java | 4 ++-- .../solr/cloud/RecoveryAfterSoftCommitTest.java | 4 ++-- .../apache/solr/cloud/ReplicationFactorTest.java | 4 ++-- .../cloud/AbstractFullDistribZkTestBase.java | 16 +++++++++++++--- 8 files changed, 37 insertions(+), 19 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java index 1f387ad9b4ce..8a37be58d551 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java +++ b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java @@ -20,12 +20,13 @@ import java.util.Map; import java.util.Properties; -import com.google.common.base.Strings; import org.apache.solr.common.StringUtils; import org.apache.solr.common.cloud.Replica; import org.apache.solr.core.CoreDescriptor; import org.apache.solr.util.PropertiesUtil; +import com.google.common.base.Strings; + public class CloudDescriptor { private final CoreDescriptor cd; @@ -64,7 +65,12 @@ public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { if (Strings.isNullOrEmpty(nodeName)) this.nodeName = null; this.numShards = PropertiesUtil.toInteger(props.getProperty(CloudDescriptor.NUM_SHARDS), null); - this.replicaType = Replica.Type.valueOf(props.getProperty(CloudDescriptor.REPLICA_TYPE, Replica.Type.REALTIME.toString())); + String replicaTypeStr = props.getProperty(CloudDescriptor.REPLICA_TYPE); + if (Strings.isNullOrEmpty(replicaTypeStr)) { + this.replicaType = Replica.Type.REALTIME; + } else { + this.replicaType = Replica.Type.valueOf(replicaTypeStr); + } for (String propName : props.stringPropertyNames()) { if (propName.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) { collectionParams.put(propName.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), props.getProperty(propName)); diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index 71c2924ef4e9..ce35529660a9 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -70,13 +70,13 @@ protected int getPassiveReplicaCount() { public static void beforeSuperClass() { schemaString = "schema15.xml"; // we need a string id System.setProperty("solr.autoCommit.maxTime", "15000"); + TestInjection.waitForReplicasInSync = null; setErrorHook(); } @AfterClass public static void afterSuperClass() { System.clearProperty("solr.autoCommit.maxTime"); - TestInjection.waitForReplicasInSync = null; clearErrorHook(); } @@ -171,9 +171,11 @@ public void test() throws Exception { searchThread.start(); } - StoppableCommitThread commitThread = new StoppableCommitThread(cloudClient, 1000, false); - threads.add(commitThread); - commitThread.start(); + if (usually()) { + StoppableCommitThread commitThread = new StoppableCommitThread(cloudClient, 1000, false); + threads.add(commitThread); + commitThread.start(); + } // TODO: we only do this sometimes so that we can sometimes compare against control, // it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer diff --git a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java index e2a80b69741d..3ca3f0fc0096 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java @@ -148,8 +148,8 @@ public void testSplitShard() throws Exception { assertEquals(0, response.getStatus()); assertTrue(response.isSuccess()); Map> coresStatus = response.getCollectionCoresStatus(); - assertEquals(0, (int) coresStatus.get(collectionName + "_shard1_0_replica1").get("status")); - assertEquals(0, (int) coresStatus.get(collectionName + "_shard1_1_replica1").get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shard1_0" , Replica.Type.REALTIME, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shard1_1" , Replica.Type.REALTIME, 1)).get("status")); waitForState("Expected all shards to be active and parent shard to be removed", collectionName, (n, c) -> { if (c.getSlice("shard1").getState() == Slice.State.ACTIVE) diff --git a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java index d0b0c5eaa83b..840679148871 100644 --- a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java @@ -110,10 +110,10 @@ protected CloudSolrClient createCloudClient(String defaultCollection) { */ @Override public JettySolrRunner createJetty(File solrHome, String dataDir, - String shardList, String solrConfigOverride, String schemaOverride) + String shardList, String solrConfigOverride, String schemaOverride, Replica.Type replicaType) throws Exception { - return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride); + return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride, replicaType); } @Test diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java index fd1b40343fd5..199091952d0e 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java @@ -161,9 +161,9 @@ private void oneShardTest() throws Exception { */ @Override public JettySolrRunner createJetty(File solrHome, String dataDir, - String shardList, String solrConfigOverride, String schemaOverride) + String shardList, String solrConfigOverride, String schemaOverride, Replica.Type replicaType) throws Exception { - return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride); + return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride, replicaType); } protected void sendCommitWithRetry(Replica replica) throws Exception { diff --git a/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java b/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java index 1da9aca086ea..eabd9b03c493 100644 --- a/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java @@ -64,10 +64,10 @@ public static void afterTest() { */ @Override public JettySolrRunner createJetty(File solrHome, String dataDir, - String shardList, String solrConfigOverride, String schemaOverride) + String shardList, String solrConfigOverride, String schemaOverride, Replica.Type replicaType) throws Exception { - return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride); + return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride, replicaType); } @Test diff --git a/solr/core/src/test/org/apache/solr/cloud/ReplicationFactorTest.java b/solr/core/src/test/org/apache/solr/cloud/ReplicationFactorTest.java index 9100eee67f4b..abd394a9df69 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ReplicationFactorTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ReplicationFactorTest.java @@ -64,10 +64,10 @@ public ReplicationFactorTest() { */ @Override public JettySolrRunner createJetty(File solrHome, String dataDir, - String shardList, String solrConfigOverride, String schemaOverride) + String shardList, String solrConfigOverride, String schemaOverride, Replica.Type replicaType) throws Exception { - return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride); + return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride, replicaType); } @Test diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java index 4d362866ef6f..2d082ea48fe4 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java @@ -236,6 +236,11 @@ public void distribSetUp() throws Exception { CreateMode.PERSISTENT, true); } } + if (useAppendReplicas()) { + log.info("Will use {} replicas unless explicitly asked otherwise", Replica.Type.APPEND); + } else { + log.info("Will use {} replicas unless explicitly asked otherwise", Replica.Type.REALTIME); + } } @BeforeClass @@ -512,7 +517,7 @@ public JettySolrRunner createJetty(String dataDir, String ulogDir, String shardL return jetty; } - public JettySolrRunner createJetty(File solrHome, String dataDir, String shardList, String solrConfigOverride, String schemaOverride) throws Exception { + public final JettySolrRunner createJetty(File solrHome, String dataDir, String shardList, String solrConfigOverride, String schemaOverride) throws Exception { return createJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride, null); } @@ -541,7 +546,7 @@ public JettySolrRunner createJetty(File solrHome, String dataDir, String shardLi props.setProperty("solr.data.dir", getDataDir(dataDir)); if (replicaType != null) { props.setProperty("replicaType", replicaType.toString()); - } else { // TODO: include the case with no replicaTYpe defined: if (random().nextBoolean()) { + } else if (random().nextBoolean()) { props.setProperty("replicaType", Replica.Type.REALTIME.toString()); } props.setProperty("coreRootDirectory", solrHome.toPath().resolve("cores").toAbsolutePath().toString()); @@ -558,7 +563,7 @@ public JettySolrRunner createJetty(File solrHome, String dataDir, String shardLi * with IPTables. */ public JettySolrRunner createProxiedJetty(File solrHome, String dataDir, - String shardList, String solrConfigOverride, String schemaOverride) + String shardList, String solrConfigOverride, String schemaOverride, Replica.Type replicaType) throws Exception { JettyConfig jettyconfig = JettyConfig.builder() @@ -578,6 +583,11 @@ public JettySolrRunner createProxiedJetty(File solrHome, String dataDir, props.setProperty("shards", shardList); if (dataDir != null) props.setProperty("solr.data.dir", getDataDir(dataDir)); + if (replicaType != null) { + props.setProperty("replicaType", replicaType.toString()); + } else if (random().nextBoolean()) { + props.setProperty("replicaType", Replica.Type.REALTIME.toString()); + } props.setProperty("coreRootDirectory", solrHome.toPath().resolve("cores").toAbsolutePath().toString()); JettySolrRunner jetty = new JettySolrRunner(solrHome.getPath(), props, jettyconfig); From b16f0e3531bce9ca600bb53d4bea908c269ce0d1 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Mon, 8 May 2017 15:06:26 -0700 Subject: [PATCH 18/41] Temporary commit for SOLR-10524 --- .../src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java | 3 +++ .../test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java | 2 ++ 2 files changed, 5 insertions(+) diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java index c7e8c523961a..1eddeaa7579f 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java @@ -166,6 +166,9 @@ public ClusterState enqueueUpdate(ClusterState prevState, ZkWriteCommand cmd, Zk * @return true if a flush is required, false otherwise */ protected boolean maybeFlushBefore(ZkWriteCommand cmd) { + if (cmd.collection == null) { + return false; + } return cmd.collection.getStateFormat() != lastStateFormat; } diff --git a/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java b/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java index 85dbf4aba720..729857f17eff 100644 --- a/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java @@ -35,6 +35,7 @@ import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.util.Utils; import org.apache.zookeeper.KeeperException; +import org.junit.Ignore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,6 +43,7 @@ public class ZkStateWriterTest extends SolrTestCaseJ4 { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + @Ignore public void testZkStateWriterBatching() throws Exception { String zkDir = createTempDir("testZkStateWriterBatching").toFile().getAbsolutePath(); From d78946daf122aeb740edc0182ca0fa74d146c3d6 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 9 May 2017 13:03:30 -0700 Subject: [PATCH 19/41] Added support for replica types in create shard operation --- .../org/apache/solr/cloud/CreateShardCmd.java | 48 +++++++++++++++---- .../solr/cloud/CollectionsAPISolrJTest.java | 12 +++-- .../solrj/request/CollectionAdminRequest.java | 23 ++++++--- .../solr/common/cloud/ZkStateReader.java | 4 -- 4 files changed, 64 insertions(+), 23 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java index 52df32b692f2..f38009cfe289 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java @@ -28,6 +28,7 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.CoreAdminParams; @@ -41,7 +42,10 @@ import static org.apache.solr.cloud.Assign.getNodesForNewReplicas; import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_CONF; +import static org.apache.solr.common.cloud.ZkStateReader.APPEND_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.PASSIVE_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; import static org.apache.solr.common.params.CommonAdminParams.ASYNC; @@ -67,9 +71,18 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(); DocCollection collection = clusterState.getCollection(collectionName); - int repFactor = message.getInt(REPLICATION_FACTOR, collection.getInt(REPLICATION_FACTOR, 1)); +// int repFactor = message.getInt(REPLICATION_FACTOR, collection.getInt(REPLICATION_FACTOR, 1)); + int numRealtimeReplicas = message.getInt(REALTIME_REPLICAS, message.getInt(REPLICATION_FACTOR, collection.getInt(REALTIME_REPLICAS, collection.getInt(REPLICATION_FACTOR, 1)))); + int numPassiveReplicas = message.getInt(PASSIVE_REPLICAS, collection.getInt(PASSIVE_REPLICAS, 0)); + int numAppendReplicas = message.getInt(APPEND_REPLICAS, collection.getInt(APPEND_REPLICAS, 0)); + int totalReplicas = numRealtimeReplicas + numPassiveReplicas + numAppendReplicas; + + if (numRealtimeReplicas + numAppendReplicas <= 0) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, REALTIME_REPLICAS + " + " + APPEND_REPLICAS + " must be greater than 0"); + } + Object createNodeSetStr = message.get(OverseerCollectionMessageHandler.CREATE_NODE_SET); - List sortedNodeList = getNodesForNewReplicas(clusterState, collectionName, sliceName, repFactor, + List sortedNodeList = getNodesForNewReplicas(clusterState, collectionName, sliceName, totalReplicas, createNodeSetStr, ocmh.overseer.getZkController().getCoreContainer()); ZkStateReader zkStateReader = ocmh.zkStateReader; @@ -90,19 +103,38 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul String async = message.getStr(ASYNC); Map requestMap = null; if (async != null) { - requestMap = new HashMap<>(repFactor, 1.0f); + requestMap = new HashMap<>(totalReplicas, 1.0f); } - - for (int j = 1; j <= repFactor; j++) { + + int createdRealtimeReplicas = 0, createdAppendReplicas = 0, createdPassiveReplicas = 0; + + for (int j = 1; j <= totalReplicas; j++) { + int coreNameNumber; + Replica.Type typeToCreate; + if (createdRealtimeReplicas < numRealtimeReplicas) { + createdRealtimeReplicas++; + coreNameNumber = createdRealtimeReplicas; + typeToCreate = Replica.Type.REALTIME; + } else if (createdAppendReplicas < numAppendReplicas) { + createdAppendReplicas++; + coreNameNumber = createdAppendReplicas; + typeToCreate = Replica.Type.APPEND; + } else { + createdPassiveReplicas++; + coreNameNumber = createdPassiveReplicas; + typeToCreate = Replica.Type.PASSIVE; + } String nodeName = sortedNodeList.get(((j - 1)) % sortedNodeList.size()).nodeName; - String shardName = collectionName + "_" + sliceName + "_replica" + j; - log.info("Creating shard " + shardName + " as part of slice " + sliceName + " of collection " + collectionName + String coreName = Assign.buildCoreName(collectionName, sliceName, typeToCreate, coreNameNumber); +// String coreName = collectionName + "_" + sliceName + "_replica" + j; + log.info("Creating replica " + coreName + " as part of slice " + sliceName + " of collection " + collectionName + " on " + nodeName); // Need to create new params for each request ModifiableSolrParams params = new ModifiableSolrParams(); params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.CREATE.toString()); - params.set(CoreAdminParams.NAME, shardName); + params.set(CoreAdminParams.NAME, coreName); + params.set(CoreAdminParams.REPLICA_TYPE, typeToCreate.name()); params.set(COLL_CONF, configName); params.set(CoreAdminParams.COLLECTION, collectionName); params.set(CoreAdminParams.SHARD, sliceName); diff --git a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java index 93f61b509c72..2a2da781abe0 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java @@ -117,17 +117,17 @@ public void testCloudInfoInCoreStatus() throws IOException, SolrServerException @Test public void testCreateAndDeleteShard() throws IOException, SolrServerException { - // Create an implicit collection String collectionName = "solrj_implicit"; CollectionAdminResponse response - = CollectionAdminRequest.createCollectionWithImplicitRouter(collectionName, "conf", "shardA,shardB", 1) + = CollectionAdminRequest.createCollectionWithImplicitRouter(collectionName, "conf", "shardA,shardB", 1, 1, 1) + .setMaxShardsPerNode(3) .process(cluster.getSolrClient()); assertEquals(0, response.getStatus()); assertTrue(response.isSuccess()); Map> coresStatus = response.getCollectionCoresStatus(); - assertEquals(2, coresStatus.size()); + assertEquals(6, coresStatus.size()); // Add a shard to the implicit collection response = CollectionAdminRequest.createShard(collectionName, "shardC").process(cluster.getSolrClient()); @@ -135,8 +135,10 @@ public void testCreateAndDeleteShard() throws IOException, SolrServerException { assertEquals(0, response.getStatus()); assertTrue(response.isSuccess()); coresStatus = response.getCollectionCoresStatus(); - assertEquals(1, coresStatus.size()); - assertEquals(0, (int) coresStatus.get(collectionName + "_shardC_replica1").get("status")); + assertEquals(3, coresStatus.size()); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.REALTIME, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.APPEND, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.PASSIVE, 1)).get("status")); response = CollectionAdminRequest.deleteShard(collectionName, "shardC").process(cluster.getSolrClient()); diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java index b248f9253b95..10b5c6969942 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java @@ -365,6 +365,22 @@ public static Create createCollection(String collection, int numShards, int numR public static Create createCollectionWithImplicitRouter(String collection, String config, String shards, int numReplicas) { return new Create(collection, config, shards, numReplicas); } + + /** + * Returns a SolrRequest for creating a collection with the implicit router and specific types of replicas + * @param collection the collection name + * @param config the collection config + * @param shards a shard definition string + * @param numRealtimeReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#REALTIME} + * @param numAppendReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#APPEND} + * @param numPassiveReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} + */ + public static Create createCollectionWithImplicitRouter(String collection, String config, String shards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) { + Create createRequest = new Create(collection, config, shards, numRealtimeReplicas); + createRequest.appendReplicas = numAppendReplicas; + createRequest.passiveReplicas = numPassiveReplicas; + return createRequest; + } // CREATE request public static class Create extends AsyncCollectionSpecificAdminRequest { @@ -422,7 +438,6 @@ private Create(String collection, String config, String shards, int numRealtimeR public Create setRealtimeReplicas(Integer realtimeReplicas) { this.realtimeReplicas = realtimeReplicas; return this;} public Create setAppendReplicas(Integer appendReplicas) { this.appendReplicas = appendReplicas; return this;} - @Deprecated public Create setReplicationFactor(Integer repl) { this.realtimeReplicas = repl; return this; } public Create setStateFormat(Integer stateFormat) { this.stateFormat = stateFormat; return this; } public Create setRule(String... s){ this.rule = s; return this; } @@ -434,11 +449,7 @@ private Create(String collection, String config, String shards, int numRealtimeR public String getShards() { return shards; } public Integer getNumShards() { return numShards; } public Integer getMaxShardsPerNode() { return maxShardsPerNode; } - /** - * - * @deprecated Use {@link #getNumRealtimeReplicas()} - */ - @Deprecated + public Integer getReplicationFactor() { return getNumRealtimeReplicas(); } public Integer getNumRealtimeReplicas() { return realtimeReplicas; } public Boolean getAutoAddReplicas() { return autoAddReplicas; } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index c0a6caf6c8f2..3f6f1accbb09 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -94,10 +94,6 @@ public class ZkStateReader implements Closeable { public static final String REJOIN_AT_HEAD_PROP = "rejoinAtHead"; public static final String SOLR_SECURITY_CONF_PATH = "/security.json"; - /** - *@deprecated Use {@link #REALTIME_REPLICAS} - */ - @Deprecated public static final String REPLICATION_FACTOR = "replicationFactor"; public static final String MAX_SHARDS_PER_NODE = "maxShardsPerNode"; public static final String AUTO_ADD_REPLICAS = "autoAddReplicas"; From 507548a94e1f8e95e76f3a1d6a41a1d5b29210e5 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 9 May 2017 13:04:59 -0700 Subject: [PATCH 20/41] Minor improvements to recovery code --- .../src/java/org/apache/solr/cloud/RecoveryStrategy.java | 6 +----- solr/core/src/java/org/apache/solr/cloud/ZkController.java | 3 +++ .../java/org/apache/solr/handler/ReplicationHandler.java | 4 ++++ .../java/org/apache/solr/handler/admin/PrepRecoveryOp.java | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index 53b6f3e2b57f..3c8c1f4f99db 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -330,6 +330,7 @@ final private void doReplicateOnlyRecovery(SolrCore core) throws InterruptedExce throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader."); } if (cloudDesc.isLeader()) { + assert cloudDesc.getReplicaType() != Replica.Type.PASSIVE; // we are now the leader - no one else must have been suitable LOG.warn("We have not yet recovered - but we are now the leader!"); LOG.info("Finished recovery process."); @@ -342,11 +343,6 @@ final private void doReplicateOnlyRecovery(SolrCore core) throws InterruptedExce ourUrl); zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING); - if (isClosed()) { - LOG.info("Recovery for core {} has been closed", core.getName()); - break; - } - if (isClosed()) { LOG.info("Recovery for core {} has been closed", core.getName()); break; diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 5fdd92cae6ae..11fff1f09f0f 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -892,6 +892,9 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov if (replica == null || replica.getType() != Type.PASSIVE) { joinElection(desc, afterExpiration, joinAtHead); } else if (replica.getType() == Type.PASSIVE) { + if (joinAtHead) { + log.warn("Replica {} was designated as preferred leader but it's type is {}, It won't join election", coreZkNodeName, Type.PASSIVE); + } log.debug("Replica {} skipping election because it's type is {}", coreZkNodeName, Type.PASSIVE); startReplicationFromLeader(coreName, false); } diff --git a/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java b/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java index 94ff1893928b..90d33bee0dc8 100644 --- a/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java @@ -403,6 +403,10 @@ public IndexFetchResult doFetch(SolrParams solrParams, boolean forceReplication) String masterUrl = solrParams == null ? null : solrParams.get(MASTER_URL); if (!indexFetchLock.tryLock()) return IndexFetchResult.LOCK_OBTAIN_FAILED; + if (core.getCoreContainer().isShutDown()) { + LOG.warn("I was asked to replicate but CoreContainer is shutting down"); + return IndexFetchResult.LOCK_OBTAIN_FAILED;//nocommit: different + } try { if (masterUrl != null) { if (currentIndexFetcher != null && currentIndexFetcher != pollingIndexFetcher) { diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java index 8e5408b7a963..39892b0e6b66 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java @@ -186,7 +186,7 @@ public void execute(CallInfo it) throws Exception { } // solrcloud_debug - if (log.isDebugEnabled()) { + if (log.isDebugEnabled() && core != null) { try { LocalSolrQueryRequest r = new LocalSolrQueryRequest(core, new ModifiableSolrParams()); From d9021e9e542e6ec4454720eb3c2b3da881987bc8 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 9 May 2017 13:05:50 -0700 Subject: [PATCH 21/41] Added test coverage for replica types with Collections API --- .../apache/solr/cloud/TestCollectionAPI.java | 7 ++- .../apache/solr/cloud/TestPassiveReplica.java | 43 ++++++++++++------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java b/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java index dd55f23c30fd..68509b86cad3 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java @@ -60,7 +60,12 @@ public TestCollectionAPI() { @ShardsFixed(num = 2) public void test() throws Exception { try (CloudSolrClient client = createCloudClient(null)) { - CollectionAdminRequest.Create req = CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf1",2,2); + CollectionAdminRequest.Create req; + if (useAppendReplicas()) { + req = CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf1",2, 0, 1, 1); + } else { + req = CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf1",2, 1, 0, 1); + } req.setMaxShardsPerNode(2); client.request(req); createCollection(null, COLLECTION_NAME1, 1, 1, 1, client, null, "conf1"); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java index d087c1966795..e1f597bab916 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java @@ -63,7 +63,6 @@ public class TestPassiveReplica extends SolrCloudTestCase { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - // TODO: Make sure that FORCELEADER can't be used with Passive // TODO: Backup/Snapshot should not work on passive replicas // TODO: ADDSHARD operation @@ -139,22 +138,34 @@ public void testCreateDelete() throws Exception { CollectionAdminRequest.createCollection(collectionName, "conf", 2, 1, 0, 3) .setMaxShardsPerNode(100) .process(cluster.getSolrClient()); - DocCollection docCollection = getCollectionState(collectionName); - assertNotNull(docCollection); - assertEquals("Expecting 4 relpicas per shard", - 8, docCollection.getReplicas().size()); - assertEquals("Expecting 6 passive replicas, 3 per shard", - 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); - assertEquals("Expecting 2 writer replicas, one per shard", - 2, docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); - for (Slice s:docCollection.getSlices()) { - // read-only replicas can never become leaders - assertFalse(s.getLeader().getType() == Replica.Type.PASSIVE); - List shardElectionNodes = cluster.getZkClient().getChildren(ZkStateReader.getShardLeadersElectPath(collectionName, s.getName()), null, true); - assertEquals("Unexpected election nodes for Shard: " + s.getName() + ": " + Arrays.toString(shardElectionNodes.toArray()), - 1, shardElectionNodes.size()); + boolean reloaded = false; + while (true) { + DocCollection docCollection = getCollectionState(collectionName); + assertNotNull(docCollection); + assertEquals("Expecting 4 relpicas per shard", + 8, docCollection.getReplicas().size()); + assertEquals("Expecting 6 passive replicas, 3 per shard", + 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); + assertEquals("Expecting 2 writer replicas, one per shard", + 2, docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); + for (Slice s:docCollection.getSlices()) { + // read-only replicas can never become leaders + assertFalse(s.getLeader().getType() == Replica.Type.PASSIVE); + List shardElectionNodes = cluster.getZkClient().getChildren(ZkStateReader.getShardLeadersElectPath(collectionName, s.getName()), null, true); + assertEquals("Unexpected election nodes for Shard: " + s.getName() + ": " + Arrays.toString(shardElectionNodes.toArray()), + 1, shardElectionNodes.size()); + } + assertUlogPresence(docCollection); + if (reloaded) { + break; + } else { + // reload + CollectionAdminResponse response = CollectionAdminRequest.reloadCollection(collectionName) + .process(cluster.getSolrClient()); + assertEquals(0, response.getStatus()); + reloaded = true; + } } - assertUlogPresence(docCollection); } finally { zkClient().printLayoutToStdOut(); } From 6271229493a0f5adac675ee0f0d2c4b2c148cef5 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 10 May 2017 12:57:40 -0700 Subject: [PATCH 22/41] Backup/Restore with replica types --- .../org/apache/solr/cloud/RestoreCmd.java | 52 +++++++++++++++++-- .../AbstractCloudBackupRestoreTestCase.java | 23 ++++---- .../solr/common/cloud/DocCollection.java | 24 +++++++++ 3 files changed, 85 insertions(+), 14 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java b/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java index 4e7fb581b347..367ee8671479 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java @@ -33,6 +33,7 @@ import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.ImplicitDocRouter; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; @@ -56,6 +57,7 @@ import static org.apache.solr.cloud.OverseerCollectionMessageHandler.SHARDS_PROP; import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_TYPE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD; import static org.apache.solr.common.params.CommonAdminParams.ASYNC; @@ -178,6 +180,12 @@ public void call(ClusterState state, ZkNodeProps message, NamedList results) thr propMap.put(Overseer.QUEUE_OPERATION, CREATESHARD); propMap.put(COLLECTION_PROP, restoreCollectionName); propMap.put(SHARD_ID_PROP, slice.getName()); + + if (restoreCollection.getNumRealtimeReplicas() != null && restoreCollection.getNumRealtimeReplicas() >= 1) { + propMap.put(REPLICA_TYPE, Replica.Type.REALTIME.name()); + } else if (restoreCollection.getNumAppendReplicas() != null && restoreCollection.getNumAppendReplicas() >= 1) { + propMap.put(REPLICA_TYPE, Replica.Type.APPEND.name()); + } // add async param if (asyncId != null) { propMap.put(ASYNC, asyncId); @@ -216,17 +224,51 @@ public void call(ClusterState state, ZkNodeProps message, NamedList results) thr //refresh the location copy of collection state restoreCollection = zkStateReader.getClusterState().getCollection(restoreCollectionName); - //Add the remaining replicas for each shard - Integer numReplicas = restoreCollection.getReplicationFactor(); - if (numReplicas != null && numReplicas > 1) { + //Add the remaining replicas for each shard, considering it's type + int numRealtimeReplicas = restoreCollection.getNumRealtimeReplicas() != null? + restoreCollection.getNumRealtimeReplicas():0; + if (numRealtimeReplicas == 0) { + numRealtimeReplicas = restoreCollection.getReplicationFactor() != null? + restoreCollection.getReplicationFactor():0; + } + int numAppendReplicas = restoreCollection.getNumAppendReplicas() != null? + restoreCollection.getNumAppendReplicas():0; + int numPassiveReplicas = restoreCollection.getNumPassiveReplicas() != null? + restoreCollection.getNumPassiveReplicas():0; + + int createdRealtimeReplicas = 0, createdAppendReplicas = 0, createdPassiveReplicas = 0; + + // We already created either a REALTIME or an APPEND replica as leader + if (numRealtimeReplicas > 0) { + createdRealtimeReplicas++; + } else if (createdAppendReplicas > 0) { + createdAppendReplicas++; + } + + int totalReplicasPerShard = numRealtimeReplicas + numAppendReplicas + numPassiveReplicas; + + if (totalReplicasPerShard > 1) { log.info("Adding replicas to restored collection={}", restoreCollection); for (Slice slice : restoreCollection.getSlices()) { - for (int i = 1; i < numReplicas; i++) { - log.debug("Adding replica for shard={} collection={} ", slice.getName(), restoreCollection); + for (int i = 1; i < totalReplicasPerShard; i++) { + Replica.Type typeToCreate; + if (createdRealtimeReplicas < numRealtimeReplicas) { + createdRealtimeReplicas++; + typeToCreate = Replica.Type.REALTIME; + } else if (createdAppendReplicas < numAppendReplicas) { + createdAppendReplicas++; + typeToCreate = Replica.Type.APPEND; + } else { + createdPassiveReplicas++; + typeToCreate = Replica.Type.PASSIVE; + } + + log.debug("Adding replica for shard={} collection={} of type {} ", slice.getName(), restoreCollection, typeToCreate); HashMap propMap = new HashMap<>(); propMap.put(COLLECTION_PROP, restoreCollectionName); propMap.put(SHARD_ID_PROP, slice.getName()); + propMap.put(REPLICA_TYPE, typeToCreate.name()); // add async param if (asyncId != null) { propMap.put(ASYNC, asyncId); diff --git a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java index f39cfed48d8f..e9cebc9eb4da 100644 --- a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java @@ -29,6 +29,7 @@ import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.CollectionAdminRequest.ClusterProp; import org.apache.solr.client.solrj.response.RequestStatusState; @@ -45,8 +46,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.solr.common.params.ShardParams._ROUTE_; - /** * This class implements the logic required to test Solr cloud backup/restore capability. */ @@ -81,11 +80,15 @@ public static void createCluster() throws Exception { @Test public void test() throws Exception { boolean isImplicit = random().nextBoolean(); + boolean doSplitShardOperation = !isImplicit && random().nextBoolean(); int replFactor = TestUtil.nextInt(random(), 1, 2); + // Split Shard not supported with replica types + int numAppendReplicas = TestUtil.nextInt(random(), 0, 1); + int numPassiveReplicas = TestUtil.nextInt(random(), 0, 1); CollectionAdminRequest.Create create = - CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor); - if (NUM_SHARDS * replFactor > cluster.getJettySolrRunners().size() || random().nextBoolean()) { - create.setMaxShardsPerNode(NUM_SHARDS);//just to assert it survives the restoration + CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor, numAppendReplicas, numPassiveReplicas); + if (NUM_SHARDS * (replFactor + numAppendReplicas + numPassiveReplicas) > cluster.getJettySolrRunners().size() || random().nextBoolean()) { + create.setMaxShardsPerNode((int)Math.ceil(NUM_SHARDS * (replFactor + numAppendReplicas + numPassiveReplicas) / cluster.getJettySolrRunners().size()));//just to assert it survives the restoration } if (random().nextBoolean()) { create.setAutoAddReplicas(true);//just to assert it survives the restoration @@ -109,7 +112,7 @@ public void test() throws Exception { indexDocs(getCollectionName()); - if (!isImplicit && random().nextBoolean()) { + if (doSplitShardOperation) { // shard split the first shard int prevActiveSliceCount = getActiveSliceCount(getCollectionName()); CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(getCollectionName()); @@ -277,9 +280,11 @@ private Map getShardToDocCountMap(CloudSolrClient client, DocCo Map shardToDocCount = new TreeMap<>(); for (Slice slice : docCollection.getActiveSlices()) { String shardName = slice.getName(); - long docsInShard = client.query(docCollection.getName(), new SolrQuery("*:*").setParam(_ROUTE_, shardName)) - .getResults().getNumFound(); - shardToDocCount.put(shardName, (int) docsInShard); + try (HttpSolrClient leaderClient = new HttpSolrClient.Builder(slice.getLeader().getCoreUrl()).withHttpClient(client.getHttpClient()).build()) { + long docsInShard = leaderClient.query(new SolrQuery("*:*").setParam("distrib", "false")) + .getResults().getNumFound(); + shardToDocCount.put(shardName, (int) docsInShard); + } } return shardToDocCount; } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java b/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java index 3b409b7f72fa..3b6ac803776c 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java @@ -35,6 +35,9 @@ import static org.apache.solr.common.cloud.ZkStateReader.AUTO_ADD_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE; import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; +import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.APPEND_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.PASSIVE_REPLICAS; /** * Models a Collection in zookeeper (but that Java name is obviously taken, hence "DocCollection") @@ -58,6 +61,9 @@ public class DocCollection extends ZkNodeProps implements Iterable { private final String znode; private final Integer replicationFactor; + private final Integer numRealtimeReplicas; + private final Integer numAppendReplicas; + private final Integer numPassiveReplicas; private final Integer maxShardsPerNode; private final Boolean autoAddReplicas; @@ -81,6 +87,9 @@ public DocCollection(String name, Map slices, Map this.nodeNameLeaderReplicas = new HashMap<>(); this.nodeNameReplicas = new HashMap<>(); this.replicationFactor = (Integer) verifyProp(props, REPLICATION_FACTOR); + this.numRealtimeReplicas = (Integer) verifyProp(props, REALTIME_REPLICAS); + this.numAppendReplicas = (Integer) verifyProp(props, APPEND_REPLICAS); + this.numPassiveReplicas = (Integer) verifyProp(props, PASSIVE_REPLICAS); this.maxShardsPerNode = (Integer) verifyProp(props, MAX_SHARDS_PER_NODE); Boolean autoAddReplicas = (Boolean) verifyProp(props, AUTO_ADD_REPLICAS); this.autoAddReplicas = autoAddReplicas == null ? Boolean.FALSE : autoAddReplicas; @@ -127,6 +136,9 @@ public static Object verifyProp(Map props, String propName) { switch (propName) { case MAX_SHARDS_PER_NODE: case REPLICATION_FACTOR: + case REALTIME_REPLICAS: + case PASSIVE_REPLICAS: + case APPEND_REPLICAS: return Integer.parseInt(o.toString()); case AUTO_ADD_REPLICAS: return Boolean.parseBoolean(o.toString()); @@ -330,4 +342,16 @@ public boolean equals(Object that) { return super.equals(that) && Objects.equals(this.znode, other.znode) && this.znodeVersion == other.znodeVersion; } + public Integer getNumRealtimeReplicas() { + return numRealtimeReplicas; + } + + public Integer getNumAppendReplicas() { + return numAppendReplicas; + } + + public Integer getNumPassiveReplicas() { + return numPassiveReplicas; + } + } From 1c53272e947801bdc1f54ab9ef89fe1e98edf432 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 10 May 2017 13:17:02 -0700 Subject: [PATCH 23/41] Removed OnlyLeaderIndexesTest.java (test is TestAppendReplica now) and back to using UpdateHandler stats in TestAppend and TestPassive --- .../solr/cloud/OnlyLeaderIndexesTest.java | 488 ------------------ .../apache/solr/cloud/TestAppendReplica.java | 30 +- .../apache/solr/cloud/TestPassiveReplica.java | 7 +- 3 files changed, 6 insertions(+), 519 deletions(-) delete mode 100644 solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java diff --git a/solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java b/solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java deleted file mode 100644 index 629740816c36..000000000000 --- a/solr/core/src/test/org/apache/solr/cloud/OnlyLeaderIndexesTest.java +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.cloud; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.EnumSet; -import java.util.List; -import java.util.concurrent.Semaphore; - -import org.apache.lucene.index.IndexWriter; -import org.apache.solr.client.solrj.SolrClient; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.embedded.JettySolrRunner; -import org.apache.solr.client.solrj.impl.CloudSolrClient; -import org.apache.solr.client.solrj.request.CollectionAdminRequest; -import org.apache.solr.client.solrj.request.UpdateRequest; -import org.apache.solr.client.solrj.response.CollectionAdminResponse; -import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.cloud.CollectionStatePredicate; -import org.apache.solr.common.cloud.DocCollection; -import org.apache.solr.common.cloud.Replica; -import org.apache.solr.common.cloud.Slice; -import org.apache.solr.common.cloud.ZkNodeProps; -import org.apache.solr.common.cloud.ZkStateReader; -import org.apache.solr.core.SolrCore; -import org.apache.solr.update.DirectUpdateHandler2; -import org.apache.solr.update.SolrIndexWriter; -import org.apache.solr.update.UpdateHandler; -import org.apache.solr.update.UpdateLog; -import org.apache.solr.util.RefCounted; -import org.apache.zookeeper.KeeperException; -import org.junit.BeforeClass; -import org.junit.Test; - -public class OnlyLeaderIndexesTest extends SolrCloudTestCase { - private static final String COLLECTION = "collection1"; - - @BeforeClass - public static void setupCluster() throws Exception { - System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); - System.setProperty("solr.ulog.numRecordsToKeep", "1000"); - - configureCluster(3) - .addConfig("config", TEST_PATH().resolve("configsets") - .resolve("cloud-minimal-inplace-updates").resolve("conf")) - .configure(); - - CollectionAdminRequest.ClusterProp clusterPropRequest = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false"); - CollectionAdminResponse response = clusterPropRequest.process(cluster.getSolrClient()); - assertEquals(0, response.getStatus()); - - CollectionAdminRequest - .createCollection(COLLECTION, "config", 1, 0, 3, 0) - .setMaxShardsPerNode(1) - .process(cluster.getSolrClient()); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 30); - } - - @Test - public void test() throws Exception { - assertNumberOfReplicas(0, 3, 0, false, true); - basicTest(); - recoveryTest(); - dbiTest(); - basicLeaderElectionTest(); - outOfOrderDBQWithInPlaceUpdatesTest(); - } - - public void basicTest() throws Exception { - CloudSolrClient cloudClient = cluster.getSolrClient(); - new UpdateRequest() - .add(sdoc("id", "1")) - .add(sdoc("id", "2")) - .add(sdoc("id", "3")) - .add(sdoc("id", "4")) - .process(cloudClient, COLLECTION); - - { - UpdateHandler updateHandler = getSolrCore(true).get(0).getUpdateHandler(); - RefCounted iwRef = updateHandler.getSolrCoreState().getIndexWriter(null); - assertTrue("IndexWriter at leader must see updates ", iwRef.get().hasUncommittedChanges()); - iwRef.decref(); - } - - for (SolrCore solrCore : getSolrCore(false)) { - RefCounted iwRef = solrCore.getUpdateHandler().getSolrCoreState().getIndexWriter(null); - assertFalse("IndexWriter at replicas must not see updates ", iwRef.get().hasUncommittedChanges()); - iwRef.decref(); - } - - checkRTG(1, 4, cluster.getJettySolrRunners()); - - new UpdateRequest() - .deleteById("1") - .deleteByQuery("id:2") - .process(cloudClient, COLLECTION); - - // The DBQ is not processed at replicas, so we still can get doc2 and other docs by RTG - checkRTG(2,4, getSolrRunner(false)); - - new UpdateRequest() - .commit(cloudClient, COLLECTION); - - checkShardConsistency(2, 1); - - // Update log roll over - for (SolrCore solrCore : getSolrCore(false)) { - UpdateLog updateLog = solrCore.getUpdateHandler().getUpdateLog(); - assertFalse(updateLog.hasUncommittedChanges()); - } - - // UpdateLog copy over old updates - for (int i = 15; i <= 150; i++) { - cloudClient.add(COLLECTION, sdoc("id",String.valueOf(i))); - if (random().nextInt(100) < 15 & i != 150) { - cloudClient.commit(COLLECTION); - } - } - checkRTG(120,150, cluster.getJettySolrRunners()); - waitForReplicasCatchUp(20); - } - - public void recoveryTest() throws Exception { - CloudSolrClient cloudClient = cluster.getSolrClient(); - new UpdateRequest() - .deleteByQuery("*:*") - .commit(cluster.getSolrClient(), COLLECTION); - new UpdateRequest() - .add(sdoc("id", "3")) - .add(sdoc("id", "4")) - .commit(cloudClient, COLLECTION); - // Replica recovery - new UpdateRequest() - .add(sdoc("id", "5")) - .process(cloudClient, COLLECTION); - JettySolrRunner solrRunner = getSolrRunner(false).get(0); - ChaosMonkey.stop(solrRunner); - new UpdateRequest() - .add(sdoc("id", "6")) - .process(cloudClient, COLLECTION); - ChaosMonkey.start(solrRunner); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 30); - // We skip peerSync, so replica will always trigger commit on leader - checkShardConsistency(4, 20); - - // LTR can be kicked off, so waiting for replicas recovery - new UpdateRequest() - .add(sdoc("id", "7")) - .commit(cloudClient, COLLECTION); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 30); - checkShardConsistency(5, 20); - - // More Replica recovery testing - new UpdateRequest() - .add(sdoc("id", "8")) - .process(cloudClient, COLLECTION); - checkRTG(3,8, cluster.getJettySolrRunners()); - DirectUpdateHandler2.commitOnClose = false; - ChaosMonkey.stop(solrRunner); - DirectUpdateHandler2.commitOnClose = true; - ChaosMonkey.start(solrRunner); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 30); - checkRTG(3,8, cluster.getJettySolrRunners()); - checkShardConsistency(6, 20); - - // Test replica recovery apply buffer updates - Semaphore waitingForBufferUpdates = new Semaphore(0); - Semaphore waitingForReplay = new Semaphore(0); - RecoveryStrategy.testing_beforeReplayBufferingUpdates = () -> { - try { - waitingForReplay.release(); - waitingForBufferUpdates.acquire(); - } catch (InterruptedException e) { - e.printStackTrace(); - } - }; - ChaosMonkey.stop(solrRunner); - ChaosMonkey.start(solrRunner); - waitingForReplay.acquire(); - new UpdateRequest() - .add(sdoc("id", "9")) - .add(sdoc("id", "10")) - .process(cloudClient, COLLECTION); - waitingForBufferUpdates.release(); - RecoveryStrategy.testing_beforeReplayBufferingUpdates = null; - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 30); - checkRTG(3,10, cluster.getJettySolrRunners()); - checkShardConsistency(6, 20); - for (SolrCore solrCore : getSolrCore(false)) { - RefCounted iwRef = solrCore.getUpdateHandler().getSolrCoreState().getIndexWriter(null); - assertFalse("IndexWriter at replicas must not see updates ", iwRef.get().hasUncommittedChanges()); - iwRef.decref(); - } - } - - public void dbiTest() throws Exception{ - CloudSolrClient cloudClient = cluster.getSolrClient(); - new UpdateRequest() - .deleteByQuery("*:*") - .commit(cluster.getSolrClient(), COLLECTION); - new UpdateRequest() - .add(sdoc("id", "1")) - .commit(cloudClient, COLLECTION); - checkShardConsistency(1, 1); - new UpdateRequest() - .deleteById("1") - .process(cloudClient, COLLECTION); - try { - checkRTG(1, 1, cluster.getJettySolrRunners()); - } catch (AssertionError e) { - return; - } - fail("Doc1 is deleted but it's still exist"); - } - - public void basicLeaderElectionTest() throws Exception { - CloudSolrClient cloudClient = cluster.getSolrClient(); - new UpdateRequest() - .deleteByQuery("*:*") - .commit(cluster.getSolrClient(), COLLECTION); - new UpdateRequest() - .add(sdoc("id", "1")) - .add(sdoc("id", "2")) - .process(cloudClient, COLLECTION); - String oldLeader = getLeader(); - JettySolrRunner oldLeaderJetty = getSolrRunner(true).get(0); - ChaosMonkey.kill(oldLeaderJetty); - for (int i = 0; i < 60; i++) { // wait till leader is changed - if (!oldLeader.equals(getLeader())) { - break; - } - Thread.sleep(100); - } - new UpdateRequest() - .add(sdoc("id", "3")) - .add(sdoc("id", "4")) - .process(cloudClient, COLLECTION); - ChaosMonkey.start(oldLeaderJetty); - waitForState("Replica not removed", "collection1", activeReplicaCount(0, 3, 0)); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 60); - checkRTG(1,4, cluster.getJettySolrRunners()); - new UpdateRequest() - .commit(cloudClient, COLLECTION); - checkShardConsistency(4,1); - } - - private String getLeader() throws InterruptedException { - ZkNodeProps props = cluster.getSolrClient().getZkStateReader().getLeaderRetry("collection1", "shard1", 30000); - return props.getStr(ZkStateReader.NODE_NAME_PROP); - } - - public void outOfOrderDBQWithInPlaceUpdatesTest() throws Exception { - new UpdateRequest() - .deleteByQuery("*:*") - .commit(cluster.getSolrClient(), COLLECTION); - List updates = new ArrayList<>(); - updates.add(simulatedUpdateRequest(null, "id", 1, "title_s", "title0_new", "inplace_updatable_int", 5, "_version_", Long.MAX_VALUE-100)); // full update - updates.add(simulatedDBQ("inplace_updatable_int:5", Long.MAX_VALUE-98)); - updates.add(simulatedUpdateRequest(Long.MAX_VALUE-100, "id", 1, "inplace_updatable_int", 6, "_version_", Long.MAX_VALUE-99)); - for (JettySolrRunner solrRunner: getSolrRunner(false)) { - try (SolrClient client = solrRunner.newClient()) { - for (UpdateRequest up : updates) { - up.process(client, COLLECTION); - } - } - } - JettySolrRunner oldLeaderJetty = getSolrRunner(true).get(0); - ChaosMonkey.kill(oldLeaderJetty); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 30); - ChaosMonkey.start(oldLeaderJetty); - AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION, cluster.getSolrClient().getZkStateReader(), - false, true, 30); - new UpdateRequest() - .add(sdoc("id", "2")) - .commit(cluster.getSolrClient(), COLLECTION); - checkShardConsistency(2,20); - SolrDocument doc = cluster.getSolrClient().getById(COLLECTION,"1"); - assertNotNull(doc.get("title_s")); - } - - private UpdateRequest simulatedUpdateRequest(Long prevVersion, Object... fields) throws SolrServerException, IOException { - SolrInputDocument doc = sdoc(fields); - - // get baseUrl of the leader - String baseUrl = getBaseUrl(); - - UpdateRequest ur = new UpdateRequest(); - ur.add(doc); - ur.setParam("update.distrib", "FROMLEADER"); - if (prevVersion != null) { - ur.setParam("distrib.inplace.prevversion", String.valueOf(prevVersion)); - ur.setParam("distrib.inplace.update", "true"); - } - ur.setParam("distrib.from", baseUrl); - return ur; - } - - private UpdateRequest simulatedDBQ(String query, long version) throws SolrServerException, IOException { - String baseUrl = getBaseUrl(); - - UpdateRequest ur = new UpdateRequest(); - ur.deleteByQuery(query); - ur.setParam("_version_", ""+version); - ur.setParam("update.distrib", "FROMLEADER"); - ur.setParam("distrib.from", baseUrl); - return ur; - } - - private String getBaseUrl() { - DocCollection collection = cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(COLLECTION); - Slice slice = collection.getSlice("shard1"); - return slice.getLeader().getCoreUrl(); - } - - private void checkRTG(int from, int to, List solrRunners) throws Exception{ - - for (JettySolrRunner solrRunner: solrRunners) { - try (SolrClient client = solrRunner.newClient()) { - for (int i = from; i <= to; i++) { - SolrQuery query = new SolrQuery("*:*"); - query.set("distrib", false); - query.setRequestHandler("/get"); - query.set("id",i); - QueryResponse res = client.query(COLLECTION, query); - assertNotNull("Can not find doc "+ i + " in " + solrRunner.getBaseUrl(),res.getResponse().get("doc")); - } - } - } - - } - - private void checkShardConsistency(int expected, int numTry) throws Exception{ - String replicaNotInSync = null; - for (int i = 0; i < numTry; i++) { - boolean inSync = true; - for (JettySolrRunner solrRunner: cluster.getJettySolrRunners()) { - try (SolrClient client = solrRunner.newClient()) { - SolrQuery query = new SolrQuery("*:*"); - query.set("distrib", false); - long results = client.query(COLLECTION, query).getResults().getNumFound(); - if (expected != results) { - inSync = false; - replicaNotInSync = solrRunner.getNodeName(); - break; - } - } - } - if (inSync) return; - Thread.sleep(500); - } - - fail("Some replicas are not in sync with leader: " + replicaNotInSync); - } - - private void waitForReplicasCatchUp(int numTry) throws IOException, InterruptedException { - String leaderTimeCommit = getSolrCore(true).get(0).getDeletionPolicy().getLatestCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); - if (leaderTimeCommit == null) return; - for (int i = 0; i < numTry; i++) { - boolean inSync = true; - for (SolrCore solrCore : getSolrCore(false)) { - String replicateTimeCommit = solrCore.getDeletionPolicy().getLatestCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); - if (!leaderTimeCommit.equals(replicateTimeCommit)) { - inSync = false; - Thread.sleep(500); - break; - } - } - if (inSync) return; - } - - fail("Some replicas are not in sync with leader"); - - } - - private List getSolrCore(boolean isLeader) { - List rs = new ArrayList<>(); - - CloudSolrClient cloudClient = cluster.getSolrClient(); - DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(COLLECTION); - - for (JettySolrRunner solrRunner : cluster.getJettySolrRunners()) { - if (solrRunner.getCoreContainer() == null) continue; - for (SolrCore solrCore : solrRunner.getCoreContainer().getCores()) { - CloudDescriptor cloudDescriptor = solrCore.getCoreDescriptor().getCloudDescriptor(); - Slice slice = docCollection.getSlice(cloudDescriptor.getShardId()); - Replica replica = docCollection.getReplica(cloudDescriptor.getCoreNodeName()); - if (slice.getLeader() == replica && isLeader) { - rs.add(solrCore); - } else if (slice.getLeader() != replica && !isLeader) { - rs.add(solrCore); - } - } - } - return rs; - } - - private List getSolrRunner(boolean isLeader) { - List rs = new ArrayList<>(); - - CloudSolrClient cloudClient = cluster.getSolrClient(); - DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(COLLECTION); - - for (JettySolrRunner solrRunner : cluster.getJettySolrRunners()) { - if (solrRunner.getCoreContainer() == null) continue; - for (SolrCore solrCore : solrRunner.getCoreContainer().getCores()) { - CloudDescriptor cloudDescriptor = solrCore.getCoreDescriptor().getCloudDescriptor(); - Slice slice = docCollection.getSlice(cloudDescriptor.getShardId()); - Replica replica = docCollection.getReplica(cloudDescriptor.getCoreNodeName()); - if (slice.getLeader() == replica && isLeader) { - rs.add(solrRunner); - } else if (slice.getLeader() != replica && !isLeader) { - rs.add(solrRunner); - } - } - } - return rs; - } - - // TODO: This is copy/paste from TestPassiveReplica, refactor - private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int numPassive, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { - if (updateCollection) { - cluster.getSolrClient().getZkStateReader().forceUpdateCollection("collection1"); - } - DocCollection docCollection = getCollectionState("collection1"); - assertNotNull(docCollection); - assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, - docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, - docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - assertEquals("Unexpected number of active replicas: " + docCollection, numActive, - docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - return docCollection; - } - - private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive, int numPassive) { - return (liveNodes, collectionState) -> { - int writersFound = 0, activesFound = 0, passivesFound = 0; - if (collectionState == null) - return false; - for (Slice slice : collectionState) { - for (Replica replica : slice) { - if (replica.isActive(liveNodes)) - switch (replica.getType()) { - case APPEND: - activesFound++; - break; - case PASSIVE: - passivesFound++; - break; - case REALTIME: - writersFound++; - break; - default: - throw new AssertionError("Unexpected replica type"); - } - } - } - return numWriter == writersFound && numActive == activesFound && numPassive == passivesFound; - }; - } - -} \ No newline at end of file diff --git a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java index b56c5a732c99..23dde01b1152 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java @@ -24,6 +24,7 @@ import java.util.EnumSet; import java.util.List; import java.util.Locale; +import java.util.Map; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @@ -50,6 +51,7 @@ import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; import org.apache.solr.update.DirectUpdateHandler2; import org.apache.solr.update.SolrIndexWriter; @@ -114,28 +116,6 @@ public void tearDown() throws Exception { super.tearDown(); } - // Just to compare test time, nocommit - @Ignore - public void testCreateDelete2() throws Exception { - try { - CollectionAdminRequest.createCollection(collectionName, "conf", 1, 8, 0, 0).process(cluster.getSolrClient()); - DocCollection docCollection = getCollectionState(collectionName); - assertNotNull(docCollection); -// assertEquals("Expecting 4 relpicas per shard", -// 8, docCollection.getReplicas().size()); -// assertEquals("Expecting 6 passive replicas, 3 per shard", -// 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); -// assertEquals("Expecting 2 writer replicas, one per shard", -// 2, docCollection.getReplicas(EnumSet.of(Replica.Type.WRITER)).size()); -// for (Slice s:docCollection.getSlices()) { -// // read-only replicas can never become leaders -// assertFalse(s.getLeader().isReadOnly()); -// } - } finally { - zkClient().printLayoutToStdOut(); - } - } - /** * Asserts that Update logs exist for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#REALTIME}, but not * for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} @@ -186,6 +166,7 @@ public void testCreateDelete() throws Exception { } } + @SuppressWarnings("unchecked") public void testAddDocs() throws Exception { int numAppendReplicas = 1 + random().nextInt(3); DocCollection docCollection = createAndWaitForCollection(1, 0, numAppendReplicas, 0); @@ -212,9 +193,8 @@ public void testAddDocs() throws Exception { "qt", "/admin/plugins", "stats", "true"); QueryResponse statsResponse = appendReplicaClient.query(req); -// TODO: uncomment when SOLR-10569 is fixed -// assertEquals("Append replicas should recive all updates. Replica: " + r + ", response: " + statsResponse, -// 1L, ((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "updateHandler", "stats", "cumulative_adds")); + assertEquals("Append replicas should recive all updates. Replica: " + r + ", response: " + statsResponse, + 1L, ((Map)((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "updateHandler", "stats")).get("UPDATE.updateHandler.cumulativeAdds.count")); break; } catch (AssertionError e) { if (t.hasTimedOut()) { diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java index e1f597bab916..70f2627e8167 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java @@ -63,9 +63,6 @@ public class TestPassiveReplica extends SolrCloudTestCase { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - // TODO: Backup/Snapshot should not work on passive replicas - // TODO: ADDSHARD operation - private String collectionName = null; private final static int REPLICATION_TIMEOUT_SECS = 10; @@ -231,10 +228,8 @@ public void testAddDocs() throws Exception { "qt", "/admin/plugins", "stats", "true"); QueryResponse statsResponse = readOnlyReplicaClient.query(req); -// assertEquals("Replicas shouldn't process the add document request: " + statsResponse, -// 0L, ((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "updateHandler", "stats", "adds")); assertEquals("Replicas shouldn't process the add document request: " + statsResponse, - 0L, ((Map)((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "/update", "stats")).get("UPDATE./update.requests")); + 0L, ((Map)((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "updateHandler", "stats")).get("UPDATE.updateHandler.adds")); } } assertUlogPresence(docCollection); From 71e1149d40bbc700aef34d98907142ef38e8dbb8 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 10 May 2017 15:09:45 -0700 Subject: [PATCH 24/41] Reset TestInjector after modifying it in tests --- solr/core/src/java/org/apache/solr/util/TestInjection.java | 6 ++++-- .../ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java | 1 + .../ChaosMonkeySafeLeaderWithPassiveReplicasTest.java | 1 + .../src/test/org/apache/solr/cloud/TestAppendReplica.java | 7 ++++++- .../src/test/org/apache/solr/cloud/TestPassiveReplica.java | 6 ++++++ .../apache/solr/cloud/TestPassiveReplicaErrorHandling.java | 1 + 6 files changed, 19 insertions(+), 3 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java index 6b7b1f86bd00..f3eb4841b390 100644 --- a/solr/core/src/java/org/apache/solr/util/TestInjection.java +++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java @@ -75,7 +75,7 @@ public TestShutdownFailError(String msg) { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final Pattern ENABLED_PERCENT = Pattern.compile("(true|false)(?:\\:(\\d+))?$", Pattern.CASE_INSENSITIVE); - + private static final String LUCENE_TEST_CASE_FQN = "org.apache.lucene.util.LuceneTestCase"; /** @@ -151,6 +151,7 @@ public static void reset() { splitFailureBeforeReplicaCreation = null; prepRecoveryOpPauseForever = null; countPrepRecoveryOpPauseForever = new AtomicInteger(0); + waitForReplicasInSync = "true:60"; for (Timer timer : timers) { timer.cancel(); @@ -387,9 +388,10 @@ public static boolean waitForInSyncWithLeader(SolrCore core, ZkController zkCont String localVersion = searcher.get().getIndexReader().getIndexCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); if (localVersion == null && leaderVersion == 0 && !core.getUpdateHandler().getUpdateLog().hasUncommittedChanges()) return true; if (localVersion != null && Long.parseLong(localVersion) == leaderVersion && (leaderVersion >= t || i >= 6)) { - log.info("Waiting time for replica in sync with leader: {}", System.currentTimeMillis()-currentTime); + log.info("Waiting time for append replica to be in sync with leader: {}", System.currentTimeMillis()-currentTime); return true; } else { + log.debug("Append replica not in sync with leader yet. Attempt: {}", i); Thread.sleep(500); } } finally { diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java index ce35529660a9..94d6e4557ad2 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java @@ -78,6 +78,7 @@ public static void beforeSuperClass() { public static void afterSuperClass() { System.clearProperty("solr.autoCommit.maxTime"); clearErrorHook(); + TestInjection.reset(); } protected static final String[] fieldNames = new String[]{"f_i", "f_f", "f_d", "f_l", "f_dt"}; diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java index 556a61b79343..8639fba0771b 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java @@ -73,6 +73,7 @@ public static void beforeSuperClass() { public static void afterSuperClass() { System.clearProperty("solr.autoCommit.maxTime"); clearErrorHook(); + TestInjection.reset(); } protected static final String[] fieldNames = new String[]{"f_i", "f_f", "f_d", "f_l", "f_dt"}; diff --git a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java index 23dde01b1152..a0cc1f50a81c 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java @@ -61,8 +61,8 @@ import org.apache.solr.util.TestInjection; import org.apache.solr.util.TimeOut; import org.apache.zookeeper.KeeperException; +import org.junit.AfterClass; import org.junit.BeforeClass; -import org.junit.Ignore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,6 +93,11 @@ public static void setupCluster() throws Exception { assertEquals(0, response.getStatus()); } + @AfterClass + public static void tearDownCluster() { + TestInjection.reset(); + } + @Override public void setUp() throws Exception { super.setUp(); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java index 70f2627e8167..c9cacbbd751b 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java @@ -51,6 +51,7 @@ import org.apache.solr.util.TestInjection; import org.apache.solr.util.TimeOut; import org.apache.zookeeper.KeeperException; +import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Ignore; import org.slf4j.Logger; @@ -83,6 +84,11 @@ public static void setupCluster() throws Exception { assertEquals(0, response.getStatus()); } + @AfterClass + public static void tearDownCluster() { + TestInjection.reset(); + } + @Override public void setUp() throws Exception { super.setUp(); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java index 7dd147a08fd4..f331ef6cd1fb 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java @@ -108,6 +108,7 @@ public static void tearDownCluster() throws Exception { } proxies = null; jettys = null; + TestInjection.reset(); } @Override From 88fa86c6d813928866464202702dec86dc54e801 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 10 May 2017 17:22:31 -0700 Subject: [PATCH 25/41] Fixed Backup/Restore failure --- .../solr/cloud/AbstractCloudBackupRestoreTestCase.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java index e9cebc9eb4da..d651e68620b1 100644 --- a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java @@ -82,13 +82,15 @@ public void test() throws Exception { boolean isImplicit = random().nextBoolean(); boolean doSplitShardOperation = !isImplicit && random().nextBoolean(); int replFactor = TestUtil.nextInt(random(), 1, 2); - // Split Shard not supported with replica types int numAppendReplicas = TestUtil.nextInt(random(), 0, 1); int numPassiveReplicas = TestUtil.nextInt(random(), 0, 1); CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor, numAppendReplicas, numPassiveReplicas); if (NUM_SHARDS * (replFactor + numAppendReplicas + numPassiveReplicas) > cluster.getJettySolrRunners().size() || random().nextBoolean()) { create.setMaxShardsPerNode((int)Math.ceil(NUM_SHARDS * (replFactor + numAppendReplicas + numPassiveReplicas) / cluster.getJettySolrRunners().size()));//just to assert it survives the restoration + if (doSplitShardOperation) { + create.setMaxShardsPerNode(create.getMaxShardsPerNode() * 2); + } } if (random().nextBoolean()) { create.setAutoAddReplicas(true);//just to assert it survives the restoration @@ -235,9 +237,9 @@ private void testBackupAndRestore(String collectionName) throws Exception { CollectionAdminRequest.Restore restore = CollectionAdminRequest.restoreCollection(restoreCollectionName, backupName) .setLocation(backupLocation).setRepositoryName(getBackupRepoName()); - if (origShardToDocCount.size() > cluster.getJettySolrRunners().size()) { + if (backupCollection.getReplicas().size() > cluster.getJettySolrRunners().size()) { // may need to increase maxShardsPerNode (e.g. if it was shard split, then now we need more) - restore.setMaxShardsPerNode(origShardToDocCount.size()); + restore.setMaxShardsPerNode((int)Math.ceil(backupCollection.getReplicas().size()/cluster.getJettySolrRunners().size())); } Properties props = new Properties(); props.setProperty("customKey", "customVal"); From 1c0e2f20d5dd85cf26ff1a3df880f2e5f6e4d943 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Thu, 11 May 2017 15:09:47 -0700 Subject: [PATCH 26/41] Prevent NPE in ReplicateFromLeader --- .../org/apache/solr/cloud/ReplicateFromLeader.java | 4 +++- .../java/org/apache/solr/cloud/ZkController.java | 14 +++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index 44410186dbed..0800e0f38e81 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -127,6 +127,8 @@ private static String toPollIntervalStr(int ms) { } public void stopReplication() { - replicationProcess.close(); + if (replicationProcess != null) { + replicationProcess.close(); + } } } diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 11fff1f09f0f..02133d669cae 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -974,10 +974,12 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov public void startReplicationFromLeader(String coreName, boolean switchTransactionLog) throws InterruptedException { log.info("{} starting background replication from leader", coreName); ReplicateFromLeader replicateFromLeader = new ReplicateFromLeader(cc, coreName); - if (replicateFromLeaders.putIfAbsent(coreName, replicateFromLeader) == null) { - replicateFromLeader.startReplication(switchTransactionLog); - } else { - log.warn("A replicate from leader instance already exists for core {}", coreName); + synchronized (replicateFromLeader) { // synchronize to prevent any stop before we finish the start + if (replicateFromLeaders.putIfAbsent(coreName, replicateFromLeader) == null) { + replicateFromLeader.startReplication(switchTransactionLog); + } else { + log.warn("A replicate from leader instance already exists for core {}", coreName); + } } } @@ -985,7 +987,9 @@ public void stopReplicationFromLeader(String coreName) { log.info("{} stopping background replication from leader", coreName); ReplicateFromLeader replicateFromLeader = replicateFromLeaders.remove(coreName); if (replicateFromLeader != null) { - replicateFromLeader.stopReplication(); + synchronized (replicateFromLeader) { + replicateFromLeader.stopReplication(); + } } } From 9e82fd45a8fbd9a2662f8bb4775a8fea7fb5b8ac Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Fri, 12 May 2017 14:48:38 -0700 Subject: [PATCH 27/41] Rename replica types --- .../org/apache/solr/cloud/AddReplicaCmd.java | 2 +- .../apache/solr/cloud/CloudDescriptor.java | 4 +- .../solr/cloud/CreateCollectionCmd.java | 24 ++-- .../org/apache/solr/cloud/CreateShardCmd.java | 42 +++--- .../apache/solr/cloud/ElectionContext.java | 6 +- .../org/apache/solr/cloud/MigrateCmd.java | 8 +- .../OverseerCollectionMessageHandler.java | 30 ++--- .../apache/solr/cloud/RecoveryStrategy.java | 12 +- .../org/apache/solr/cloud/RestoreCmd.java | 54 ++++---- .../org/apache/solr/cloud/SplitShardCmd.java | 2 +- .../org/apache/solr/cloud/ZkController.java | 24 ++-- .../solr/cloud/rule/ReplicaAssigner.java | 2 +- .../org/apache/solr/core/CoreContainer.java | 8 +- .../handler/admin/CollectionsHandler.java | 6 +- .../handler/component/HttpShardHandler.java | 8 +- .../component/RealTimeGetComponent.java | 4 +- .../solr/update/DirectUpdateHandler2.java | 2 +- .../org/apache/solr/update/UpdateHandler.java | 2 +- .../org/apache/solr/update/UpdateLog.java | 2 +- .../processor/DistributedUpdateProcessor.java | 20 +-- .../org/apache/solr/util/TestInjection.java | 2 +- .../AbstractCloudBackupRestoreTestCase.java | 10 +- .../solr/cloud/BasicDistributedZk2Test.java | 2 +- .../solr/cloud/BasicDistributedZkTest.java | 2 +- .../cloud/ChaosMonkeyNothingIsSafeTest.java | 2 +- ...keyNothingIsSafeWithPullReplicasTest.java} | 34 ++--- ...MonkeySafeLeaderWithPullReplicasTest.java} | 34 ++--- .../CollectionsAPIDistributedZkTest.java | 4 +- .../solr/cloud/CollectionsAPISolrJTest.java | 12 +- .../apache/solr/cloud/ForceLeaderTest.java | 2 +- .../apache/solr/cloud/HttpPartitionTest.java | 2 +- .../LeaderInitiatedRecoveryOnCommitTest.java | 2 +- ...rseerCollectionConfigSetProcessorTest.java | 2 +- .../cloud/RecoveryAfterSoftCommitTest.java | 2 +- .../org/apache/solr/cloud/ShardSplitTest.java | 2 +- .../apache/solr/cloud/TestCloudRecovery.java | 2 +- .../apache/solr/cloud/TestCollectionAPI.java | 4 +- ...ssiveReplica.java => TestPullReplica.java} | 126 ++++++++---------- ...java => TestPullReplicaErrorHandling.java} | 50 +++---- ...ppendReplica.java => TestTlogReplica.java} | 106 +++++++-------- .../TestTolerantUpdateProcessorCloud.java | 4 +- .../hdfs/HdfsBasicDistributedZkTest.java | 2 +- .../update/TestInPlaceUpdatesDistrib.java | 8 +- .../client/solrj/impl/CloudSolrClient.java | 2 +- .../solrj/request/CollectionAdminRequest.java | 74 +++++----- .../solr/common/cloud/DocCollection.java | 36 ++--- .../org/apache/solr/common/cloud/Replica.java | 24 ++-- .../org/apache/solr/common/cloud/Slice.java | 2 +- .../solr/common/cloud/ZkStateReader.java | 8 +- ...lectionAdminRequestRequiredParamsTest.java | 2 +- .../cloud/AbstractFullDistribZkTestBase.java | 110 +++++++-------- .../org/apache/solr/cloud/ChaosMonkey.java | 8 +- 52 files changed, 461 insertions(+), 483 deletions(-) rename solr/core/src/test/org/apache/solr/cloud/{ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java => ChaosMonkeyNothingIsSafeWithPullReplicasTest.java} (90%) rename solr/core/src/test/org/apache/solr/cloud/{ChaosMonkeySafeLeaderWithPassiveReplicasTest.java => ChaosMonkeySafeLeaderWithPullReplicasTest.java} (85%) rename solr/core/src/test/org/apache/solr/cloud/{TestPassiveReplica.java => TestPullReplica.java} (83%) rename solr/core/src/test/org/apache/solr/cloud/{TestPassiveReplicaErrorHandling.java => TestPullReplicaErrorHandling.java} (84%) rename solr/core/src/test/org/apache/solr/cloud/{TestAppendReplica.java => TestTlogReplica.java} (89%) diff --git a/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java index 4420a9209fca..7338d9e645e0 100644 --- a/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/AddReplicaCmd.java @@ -72,7 +72,7 @@ ZkNodeProps addReplica(ClusterState clusterState, ZkNodeProps message, NamedList String node = message.getStr(CoreAdminParams.NODE); String shard = message.getStr(SHARD_ID_PROP); String coreName = message.getStr(CoreAdminParams.NAME); - Replica.Type replicaType = Replica.Type.valueOf(message.getStr(ZkStateReader.REPLICA_TYPE, Replica.Type.REALTIME.name())); + Replica.Type replicaType = Replica.Type.valueOf(message.getStr(ZkStateReader.REPLICA_TYPE, Replica.Type.NRT.name())); boolean parallel = message.getBool("parallel", false); if (StringUtils.isBlank(coreName)) { coreName = message.getStr(CoreAdminParams.PROPERTY_PREFIX + CoreAdminParams.NAME); diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java index 8a37be58d551..32cb65bde488 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java +++ b/solr/core/src/java/org/apache/solr/cloud/CloudDescriptor.java @@ -67,7 +67,7 @@ public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { this.numShards = PropertiesUtil.toInteger(props.getProperty(CloudDescriptor.NUM_SHARDS), null); String replicaTypeStr = props.getProperty(CloudDescriptor.REPLICA_TYPE); if (Strings.isNullOrEmpty(replicaTypeStr)) { - this.replicaType = Replica.Type.REALTIME; + this.replicaType = Replica.Type.NRT; } else { this.replicaType = Replica.Type.valueOf(replicaTypeStr); } @@ -79,7 +79,7 @@ public CloudDescriptor(String coreName, Properties props, CoreDescriptor cd) { } public boolean requiresTransactionLog() { - return this.replicaType != Replica.Type.PASSIVE; + return this.replicaType != Replica.Type.PULL; } public Replica.State getLastPublished() { diff --git a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java index 2cc331096712..3d1a54e45e37 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/CreateCollectionCmd.java @@ -95,9 +95,9 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul // look at the replication factor and see if it matches reality // if it does not, find best nodes to create more cores - int numRealtimeReplicas = message.getInt(REALTIME_REPLICAS, message.getInt(REPLICATION_FACTOR, 1)); - int numPassiveReplicas = message.getInt(PASSIVE_REPLICAS, 0); - int numAppendReplicas = message.getInt(APPEND_REPLICAS, 0); + int numNrtReplicas = message.getInt(NRT_REPLICAS, message.getInt(REPLICATION_FACTOR, 1)); + int numPullReplicas = message.getInt(PULL_REPLICAS, 0); + int numTlogReplicas = message.getInt(TLOG_REPLICAS, 0); ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(); final String async = message.getStr(ASYNC); @@ -117,8 +117,8 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul int maxShardsPerNode = message.getInt(MAX_SHARDS_PER_NODE, 1); - if (numRealtimeReplicas + numAppendReplicas <= 0) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, REALTIME_REPLICAS + " + " + APPEND_REPLICAS + " must be greater than 0"); + if (numNrtReplicas + numTlogReplicas <= 0) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, NRT_REPLICAS + " + " + TLOG_REPLICAS + " must be greater than 0"); } if (numSlices <= 0) { @@ -136,7 +136,7 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul positionVsNodes = new HashMap<>(); } else { - int totalNumReplicas = numRealtimeReplicas + numAppendReplicas + numPassiveReplicas; + int totalNumReplicas = numNrtReplicas + numTlogReplicas + numPullReplicas; if (totalNumReplicas > nodeList.size()) { log.warn("Specified number of replicas of " + totalNumReplicas @@ -155,14 +155,14 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul + ", and the number of nodes currently live or live and part of your "+CREATE_NODE_SET+" is " + nodeList.size() + ". This allows a maximum of " + maxShardsAllowedToCreate + " to be created. Value of " + NUM_SLICES + " is " + numSlices - + ", value of " + REALTIME_REPLICAS + " is " + numRealtimeReplicas - + ", value of " + APPEND_REPLICAS + " is " + numAppendReplicas - + " and value of " + PASSIVE_REPLICAS + " is " + numPassiveReplicas + + ", value of " + NRT_REPLICAS + " is " + numNrtReplicas + + ", value of " + TLOG_REPLICAS + " is " + numTlogReplicas + + " and value of " + PULL_REPLICAS + " is " + numPullReplicas + ". This requires " + requestedShardsToCreate + " shards to be created (higher than the allowed number)"); } - positionVsNodes = ocmh.identifyNodes(clusterState, nodeList, message, shardNames, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas); + positionVsNodes = ocmh.identifyNodes(clusterState, nodeList, message, shardNames, numNrtReplicas, numTlogReplicas, numPullReplicas); } ZkStateReader zkStateReader = ocmh.zkStateReader; @@ -202,8 +202,8 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul Map requestMap = new HashMap<>(); - log.debug(formatString("Creating SolrCores for new collection {0}, shardNames {1} , realtimeReplicas : {2}, appendReplicas: {3}, passiveReplicas: {4}", - collectionName, shardNames, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas)); + log.debug(formatString("Creating SolrCores for new collection {0}, shardNames {1} , nrtReplicas : {2}, tlogReplicas: {3}, pullReplicas: {4}", + collectionName, shardNames, numNrtReplicas, numTlogReplicas, numPullReplicas)); Map coresToCreate = new LinkedHashMap<>(); for (Map.Entry e : positionVsNodes.entrySet()) { ReplicaAssigner.Position position = e.getKey(); diff --git a/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java index f38009cfe289..d3eb828dc31f 100644 --- a/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/CreateShardCmd.java @@ -42,10 +42,10 @@ import static org.apache.solr.cloud.Assign.getNodesForNewReplicas; import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_CONF; -import static org.apache.solr.common.cloud.ZkStateReader.APPEND_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.PASSIVE_REPLICAS; -import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.PULL_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; import static org.apache.solr.common.params.CommonAdminParams.ASYNC; @@ -72,13 +72,13 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(); DocCollection collection = clusterState.getCollection(collectionName); // int repFactor = message.getInt(REPLICATION_FACTOR, collection.getInt(REPLICATION_FACTOR, 1)); - int numRealtimeReplicas = message.getInt(REALTIME_REPLICAS, message.getInt(REPLICATION_FACTOR, collection.getInt(REALTIME_REPLICAS, collection.getInt(REPLICATION_FACTOR, 1)))); - int numPassiveReplicas = message.getInt(PASSIVE_REPLICAS, collection.getInt(PASSIVE_REPLICAS, 0)); - int numAppendReplicas = message.getInt(APPEND_REPLICAS, collection.getInt(APPEND_REPLICAS, 0)); - int totalReplicas = numRealtimeReplicas + numPassiveReplicas + numAppendReplicas; + int numNrtReplicas = message.getInt(NRT_REPLICAS, message.getInt(REPLICATION_FACTOR, collection.getInt(NRT_REPLICAS, collection.getInt(REPLICATION_FACTOR, 1)))); + int numPullReplicas = message.getInt(PULL_REPLICAS, collection.getInt(PULL_REPLICAS, 0)); + int numTlogReplicas = message.getInt(TLOG_REPLICAS, collection.getInt(TLOG_REPLICAS, 0)); + int totalReplicas = numNrtReplicas + numPullReplicas + numTlogReplicas; - if (numRealtimeReplicas + numAppendReplicas <= 0) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, REALTIME_REPLICAS + " + " + APPEND_REPLICAS + " must be greater than 0"); + if (numNrtReplicas + numTlogReplicas <= 0) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, NRT_REPLICAS + " + " + TLOG_REPLICAS + " must be greater than 0"); } Object createNodeSetStr = message.get(OverseerCollectionMessageHandler.CREATE_NODE_SET); @@ -106,23 +106,23 @@ public void call(ClusterState clusterState, ZkNodeProps message, NamedList resul requestMap = new HashMap<>(totalReplicas, 1.0f); } - int createdRealtimeReplicas = 0, createdAppendReplicas = 0, createdPassiveReplicas = 0; + int createdNrtReplicas = 0, createdTlogReplicas = 0, createdPullReplicas = 0; for (int j = 1; j <= totalReplicas; j++) { int coreNameNumber; Replica.Type typeToCreate; - if (createdRealtimeReplicas < numRealtimeReplicas) { - createdRealtimeReplicas++; - coreNameNumber = createdRealtimeReplicas; - typeToCreate = Replica.Type.REALTIME; - } else if (createdAppendReplicas < numAppendReplicas) { - createdAppendReplicas++; - coreNameNumber = createdAppendReplicas; - typeToCreate = Replica.Type.APPEND; + if (createdNrtReplicas < numNrtReplicas) { + createdNrtReplicas++; + coreNameNumber = createdNrtReplicas; + typeToCreate = Replica.Type.NRT; + } else if (createdTlogReplicas < numTlogReplicas) { + createdTlogReplicas++; + coreNameNumber = createdTlogReplicas; + typeToCreate = Replica.Type.TLOG; } else { - createdPassiveReplicas++; - coreNameNumber = createdPassiveReplicas; - typeToCreate = Replica.Type.PASSIVE; + createdPullReplicas++; + coreNameNumber = createdPullReplicas; + typeToCreate = Replica.Type.PULL; } String nodeName = sortedNodeList.get(((j - 1)) % sortedNodeList.size()).nodeName; String coreName = Assign.buildCoreName(collectionName, sliceName, typeToCreate, coreNameNumber); diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java index 21549bcea25f..588262d02feb 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java +++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java @@ -428,7 +428,7 @@ void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStart) throws Kee try { // we must check LIR before registering as leader checkLIR(coreName, allReplicasInLine); - if (replicaType == Replica.Type.APPEND) { + if (replicaType == Replica.Type.TLOG) { // stop replicate from old leader zkController.stopReplicationFromLeader(coreName); if (weAreReplacement) { @@ -624,7 +624,7 @@ private boolean waitForReplicasToComeUp(int timeoutms) throws InterruptedExcepti } // on startup and after connection timeout, wait for all known shards - if (found >= slices.getReplicas(EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)).size()) { + if (found >= slices.getReplicas(EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT)).size()) { log.info("Enough replicas found to continue."); return true; } else { @@ -632,7 +632,7 @@ private boolean waitForReplicasToComeUp(int timeoutms) throws InterruptedExcepti log.info("Waiting until we see more replicas up for shard {}: total={}" + " found={}" + " timeoutin={}ms", - shardId, slices.getReplicas(EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)).size(), found, + shardId, slices.getReplicas(EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT)).size(), found, TimeUnit.MILLISECONDS.convert(timeoutAt - System.nanoTime(), TimeUnit.NANOSECONDS)); } } diff --git a/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java b/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java index a1a41bb8207b..0ea5d6e6b87a 100644 --- a/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/MigrateCmd.java @@ -51,7 +51,7 @@ import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET; import static org.apache.solr.cloud.OverseerCollectionMessageHandler.NUM_SLICES; import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; -import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE; @@ -208,7 +208,7 @@ private void migrateKey(ClusterState clusterState, DocCollection sourceCollectio Map props = makeMap( Overseer.QUEUE_OPERATION, CREATE.toLower(), NAME, tempSourceCollectionName, - REALTIME_REPLICAS, 1, + NRT_REPLICAS, 1, NUM_SLICES, 1, COLL_CONF, configName, CREATE_NODE_SET, sourceLeader.getNodeName()); @@ -224,7 +224,7 @@ private void migrateKey(ClusterState clusterState, DocCollection sourceCollectio Slice tempSourceSlice = clusterState.getCollection(tempSourceCollectionName).getSlices().iterator().next(); Replica tempSourceLeader = zkStateReader.getLeaderRetry(tempSourceCollectionName, tempSourceSlice.getName(), 120000); - String tempCollectionReplica1 = Assign.buildCoreName(tempSourceCollectionName, tempSourceSlice.getName(), Replica.Type.REALTIME, 1); + String tempCollectionReplica1 = Assign.buildCoreName(tempSourceCollectionName, tempSourceSlice.getName(), Replica.Type.NRT, 1); String coreNodeName = ocmh.waitForCoreNodeName(tempSourceCollectionName, sourceLeader.getNodeName(), tempCollectionReplica1); // wait for the replicas to be seen as active on temp source leader @@ -257,7 +257,7 @@ private void migrateKey(ClusterState clusterState, DocCollection sourceCollectio log.info("Creating a replica of temporary collection: {} on the target leader node: {}", tempSourceCollectionName, targetLeader.getNodeName()); - String tempCollectionReplica2 = Assign.buildCoreName(tempSourceCollectionName, tempSourceSlice.getName(), Replica.Type.REALTIME, 2); + String tempCollectionReplica2 = Assign.buildCoreName(tempSourceCollectionName, tempSourceSlice.getName(), Replica.Type.NRT, 2); props = new HashMap<>(); props.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower()); props.put(COLLECTION_PROP, tempSourceCollectionName); diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java index 0af3460a90e7..1615a0b14a39 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java @@ -159,9 +159,9 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler public static final Map COLL_PROPS = Collections.unmodifiableMap(makeMap( ROUTER, DocRouter.DEFAULT_NAME, ZkStateReader.REPLICATION_FACTOR, "1", - ZkStateReader.REALTIME_REPLICAS, "1", - ZkStateReader.APPEND_REPLICAS, "0", - ZkStateReader.PASSIVE_REPLICAS, "0", + ZkStateReader.NRT_REPLICAS, "1", + ZkStateReader.TLOG_REPLICAS, "0", + ZkStateReader.PULL_REPLICAS, "0", ZkStateReader.MAX_SHARDS_PER_NODE, "1", ZkStateReader.AUTO_ADD_REPLICAS, "false", DocCollection.RULE, null, @@ -733,32 +733,32 @@ Map identifyNodes(ClusterState clusterState, List nodeList, ZkNodeProps message, List shardNames, - int numRealtimeReplicas, - int numAppendReplicas, - int numPassiveReplicas) throws IOException { + int numNrtReplicas, + int numTlogReplicas, + int numPullReplicas) throws IOException { List rulesMap = (List) message.get("rule"); if (rulesMap == null) { int i = 0; Map result = new HashMap<>(); for (String aShard : shardNames) { - for (int j = 0; j < numRealtimeReplicas; j++){ - result.put(new Position(aShard, j, Replica.Type.REALTIME), nodeList.get(i % nodeList.size())); + for (int j = 0; j < numNrtReplicas; j++){ + result.put(new Position(aShard, j, Replica.Type.NRT), nodeList.get(i % nodeList.size())); i++; } - for (int j = 0; j < numAppendReplicas; j++){ - result.put(new Position(aShard, j, Replica.Type.APPEND), nodeList.get(i % nodeList.size())); + for (int j = 0; j < numTlogReplicas; j++){ + result.put(new Position(aShard, j, Replica.Type.TLOG), nodeList.get(i % nodeList.size())); i++; } - for (int j = 0; j < numPassiveReplicas; j++){ - result.put(new Position(aShard, j, Replica.Type.PASSIVE), nodeList.get(i % nodeList.size())); + for (int j = 0; j < numPullReplicas; j++){ + result.put(new Position(aShard, j, Replica.Type.PULL), nodeList.get(i % nodeList.size())); i++; } } return result; } else { - if (numAppendReplicas + numPassiveReplicas != 0) { + if (numTlogReplicas + numPullReplicas != 0) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, - Replica.Type.APPEND + " or " + Replica.Type.PASSIVE + " replica types not supported with placement rules"); + Replica.Type.TLOG + " or " + Replica.Type.PULL + " replica types not supported with placement rules"); } } @@ -767,7 +767,7 @@ Map identifyNodes(ClusterState clusterState, Map sharVsReplicaCount = new HashMap<>(); - for (String shard : shardNames) sharVsReplicaCount.put(shard, numRealtimeReplicas); + for (String shard : shardNames) sharVsReplicaCount.put(shard, numNrtReplicas); ReplicaAssigner replicaAssigner = new ReplicaAssigner(rules, sharVsReplicaCount, (List) message.get(SNITCH), diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index 3c8c1f4f99db..da75195b30e4 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -325,12 +325,12 @@ final private void doReplicateOnlyRecovery(SolrCore core) throws InterruptedExce String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName); - boolean isLeader = leaderUrl.equals(ourUrl); //TODO: We can probably delete most of this code if we say this strategy can only be used for passive replicas + boolean isLeader = leaderUrl.equals(ourUrl); //TODO: We can probably delete most of this code if we say this strategy can only be used for pull replicas if (isLeader && !cloudDesc.isLeader()) { throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader."); } if (cloudDesc.isLeader()) { - assert cloudDesc.getReplicaType() != Replica.Type.PASSIVE; + assert cloudDesc.getReplicaType() != Replica.Type.PULL; // we are now the leader - no one else must have been suitable LOG.warn("We have not yet recovered - but we are now the leader!"); LOG.info("Finished recovery process."); @@ -461,7 +461,7 @@ final public void doSyncOrReplicateRecovery(SolrCore core) throws KeeperExceptio } // we temporary ignore peersync for Append replicas - boolean firstTime = replicaType != Replica.Type.APPEND; + boolean firstTime = replicaType != Replica.Type.TLOG; List recentVersions; try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) { @@ -513,7 +513,7 @@ final public void doSyncOrReplicateRecovery(SolrCore core) throws KeeperExceptio } } - if (replicaType == Replica.Type.APPEND) { + if (replicaType == Replica.Type.TLOG) { zkController.stopReplicationFromLeader(coreName); } @@ -670,7 +670,7 @@ final public void doSyncOrReplicateRecovery(SolrCore core) throws KeeperExceptio if (successfulRecovery) { LOG.info("Registering as Active after recovery."); try { - if (replicaType == Replica.Type.APPEND) { + if (replicaType == Replica.Type.TLOG) { zkController.startReplicationFromLeader(coreName, true); } zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE); @@ -753,7 +753,7 @@ final private Future replay(SolrCore core) if (testing_beforeReplayBufferingUpdates != null) { testing_beforeReplayBufferingUpdates.run(); } - if (replicaType == Replica.Type.APPEND) { + if (replicaType == Replica.Type.TLOG) { // roll over all updates during buffering to new tlog, make RTG available SolrQueryRequest req = new LocalSolrQueryRequest(core, new ModifiableSolrParams()); diff --git a/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java b/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java index 367ee8671479..2fa847c2448b 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java @@ -181,10 +181,10 @@ public void call(ClusterState state, ZkNodeProps message, NamedList results) thr propMap.put(COLLECTION_PROP, restoreCollectionName); propMap.put(SHARD_ID_PROP, slice.getName()); - if (restoreCollection.getNumRealtimeReplicas() != null && restoreCollection.getNumRealtimeReplicas() >= 1) { - propMap.put(REPLICA_TYPE, Replica.Type.REALTIME.name()); - } else if (restoreCollection.getNumAppendReplicas() != null && restoreCollection.getNumAppendReplicas() >= 1) { - propMap.put(REPLICA_TYPE, Replica.Type.APPEND.name()); + if (restoreCollection.getNumNrtReplicas() != null && restoreCollection.getNumNrtReplicas() >= 1) { + propMap.put(REPLICA_TYPE, Replica.Type.NRT.name()); + } else if (restoreCollection.getNumTlogReplicas() != null && restoreCollection.getNumTlogReplicas() >= 1) { + propMap.put(REPLICA_TYPE, Replica.Type.TLOG.name()); } // add async param if (asyncId != null) { @@ -225,27 +225,27 @@ public void call(ClusterState state, ZkNodeProps message, NamedList results) thr restoreCollection = zkStateReader.getClusterState().getCollection(restoreCollectionName); //Add the remaining replicas for each shard, considering it's type - int numRealtimeReplicas = restoreCollection.getNumRealtimeReplicas() != null? - restoreCollection.getNumRealtimeReplicas():0; - if (numRealtimeReplicas == 0) { - numRealtimeReplicas = restoreCollection.getReplicationFactor() != null? + int numNrtReplicas = restoreCollection.getNumNrtReplicas() != null? + restoreCollection.getNumNrtReplicas():0; + if (numNrtReplicas == 0) { + numNrtReplicas = restoreCollection.getReplicationFactor() != null? restoreCollection.getReplicationFactor():0; } - int numAppendReplicas = restoreCollection.getNumAppendReplicas() != null? - restoreCollection.getNumAppendReplicas():0; - int numPassiveReplicas = restoreCollection.getNumPassiveReplicas() != null? - restoreCollection.getNumPassiveReplicas():0; + int numTlogReplicas = restoreCollection.getNumTlogReplicas() != null? + restoreCollection.getNumTlogReplicas():0; + int numPullReplicas = restoreCollection.getNumPullReplicas() != null? + restoreCollection.getNumPullReplicas():0; - int createdRealtimeReplicas = 0, createdAppendReplicas = 0, createdPassiveReplicas = 0; + int createdNrtReplicas = 0, createdTlogReplicas = 0, createdPullReplicas = 0; - // We already created either a REALTIME or an APPEND replica as leader - if (numRealtimeReplicas > 0) { - createdRealtimeReplicas++; - } else if (createdAppendReplicas > 0) { - createdAppendReplicas++; + // We already created either a REALTIME or an TLOG replica as leader + if (numNrtReplicas > 0) { + createdNrtReplicas++; + } else if (createdTlogReplicas > 0) { + createdTlogReplicas++; } - int totalReplicasPerShard = numRealtimeReplicas + numAppendReplicas + numPassiveReplicas; + int totalReplicasPerShard = numNrtReplicas + numTlogReplicas + numPullReplicas; if (totalReplicasPerShard > 1) { log.info("Adding replicas to restored collection={}", restoreCollection); @@ -253,15 +253,15 @@ public void call(ClusterState state, ZkNodeProps message, NamedList results) thr for (Slice slice : restoreCollection.getSlices()) { for (int i = 1; i < totalReplicasPerShard; i++) { Replica.Type typeToCreate; - if (createdRealtimeReplicas < numRealtimeReplicas) { - createdRealtimeReplicas++; - typeToCreate = Replica.Type.REALTIME; - } else if (createdAppendReplicas < numAppendReplicas) { - createdAppendReplicas++; - typeToCreate = Replica.Type.APPEND; + if (createdNrtReplicas < numNrtReplicas) { + createdNrtReplicas++; + typeToCreate = Replica.Type.NRT; + } else if (createdTlogReplicas < numTlogReplicas) { + createdTlogReplicas++; + typeToCreate = Replica.Type.TLOG; } else { - createdPassiveReplicas++; - typeToCreate = Replica.Type.PASSIVE; + createdPullReplicas++; + typeToCreate = Replica.Type.PULL; } log.debug("Adding replica for shard={} collection={} of type {} ", slice.getName(), restoreCollection, typeToCreate); diff --git a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java index b9494edd4995..fe9545823bed 100644 --- a/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java @@ -205,7 +205,7 @@ public boolean split(ClusterState clusterState, ZkNodeProps message, NamedList r for (int i = 0; i < subRanges.size(); i++) { String subSlice = slice + "_" + i; subSlices.add(subSlice); - String subShardName = Assign.buildCoreName(collectionName, subSlice, Replica.Type.REALTIME, 1); + String subShardName = Assign.buildCoreName(collectionName, subSlice, Replica.Type.NRT, 1); subShardNames.add(subShardName); } diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 02133d669cae..53d67471b734 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -889,13 +889,13 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov joinAtHead = replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false); } //TODO WHy would replica be null? - if (replica == null || replica.getType() != Type.PASSIVE) { + if (replica == null || replica.getType() != Type.PULL) { joinElection(desc, afterExpiration, joinAtHead); - } else if (replica.getType() == Type.PASSIVE) { + } else if (replica.getType() == Type.PULL) { if (joinAtHead) { - log.warn("Replica {} was designated as preferred leader but it's type is {}, It won't join election", coreZkNodeName, Type.PASSIVE); + log.warn("Replica {} was designated as preferred leader but it's type is {}, It won't join election", coreZkNodeName, Type.PULL); } - log.debug("Replica {} skipping election because it's type is {}", coreZkNodeName, Type.PASSIVE); + log.debug("Replica {} skipping election because it's type is {}", coreZkNodeName, Type.PULL); startReplicationFromLeader(coreName, false); } } catch (InterruptedException e) { @@ -915,7 +915,7 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov log.debug("We are " + ourUrl + " and leader is " + leaderUrl); boolean isLeader = leaderUrl.equals(ourUrl); Replica.Type replicaType = zkStateReader.getClusterState().getCollection(collection).getReplica(coreZkNodeName).getType(); - assert !(isLeader && replicaType == Type.PASSIVE): "Passive replica became leader!"; + assert !(isLeader && replicaType == Type.PULL): "Pull replica became leader!"; try (SolrCore core = cc.getCore(desc.getName())) { @@ -926,15 +926,15 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov // leader election perhaps? UpdateLog ulog = core.getUpdateHandler().getUpdateLog(); - boolean isAppendAndNotLeader = replicaType == Replica.Type.APPEND && !isLeader; - if (isAppendAndNotLeader) { + boolean isTlogReplicaAndNotLeader = replicaType == Replica.Type.TLOG && !isLeader; + if (isTlogReplicaAndNotLeader) { String commitVersion = ReplicateFromLeader.getCommitVersion(core); if (commitVersion != null) { ulog.copyOverOldUpdates(Long.parseLong(commitVersion)); } } // we will call register again after zk expiration and on reload - if (!afterExpiration && !core.isReloaded() && ulog != null && !isAppendAndNotLeader) { + if (!afterExpiration && !core.isReloaded() && ulog != null && !isTlogReplicaAndNotLeader) { // disable recovery in case shard is in construction state (for shard splits) Slice slice = getClusterState().getSlice(collection, shardId); if (slice.getState() != Slice.State.CONSTRUCTION || !isLeader) { @@ -953,7 +953,7 @@ public String register(String coreName, final CoreDescriptor desc, boolean recov boolean didRecovery = checkRecovery(recoverReloadedCores, isLeader, skipRecovery, collection, coreZkNodeName, core, cc, afterExpiration); if (!didRecovery) { - if (isAppendAndNotLeader) { + if (isTlogReplicaAndNotLeader) { startReplicationFromLeader(coreName, true); } publish(desc, Replica.State.ACTIVE); @@ -1210,7 +1210,7 @@ public void publish(final CoreDescriptor cd, final Replica.State state, boolean if (state != Replica.State.DOWN) { final Replica.State lirState = getLeaderInitiatedRecoveryState(collection, shardId, coreNodeName); if (lirState != null) { - assert cd.getCloudDescriptor().getReplicaType() != Replica.Type.PASSIVE: "LIR should not happen for passive replicas!"; + assert cd.getCloudDescriptor().getReplicaType() != Replica.Type.PULL: "LIR should not happen for pull replicas!"; if (state == Replica.State.ACTIVE) { // trying to become active, so leader-initiated state must be recovering if (lirState == Replica.State.RECOVERING) { @@ -1295,14 +1295,14 @@ public void unregister(String coreName, CoreDescriptor cd) throws InterruptedExc } Replica replica = zkStateReader.getClusterState().getReplica(collection, coreNodeName); - if (replica == null || replica.getType() != Type.PASSIVE) { + if (replica == null || replica.getType() != Type.PULL) { ElectionContext context = electionContexts.remove(new ContextKey(collection, coreNodeName)); if (context != null) { context.cancelElection(); } } -// //TODO: Do we need to stop replication for type==append? +// //TODO: Do we need to stop replication for type==tlog? CloudDescriptor cloudDescriptor = cd.getCloudDescriptor(); zkStateReader.unregisterCore(cloudDescriptor.getCollectionName()); diff --git a/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java b/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java index ebe29f1128f1..506e158332da 100644 --- a/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java +++ b/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java @@ -190,7 +190,7 @@ private Map tryAllPermutations(List shardNames, List positions = new ArrayList<>(); for (int pos : p) { for (int j = 0; j < shardVsReplicaCount.get(shardNames.get(pos)); j++) { - positions.add(new Position(shardNames.get(pos), j, Replica.Type.REALTIME)); + positions.add(new Position(shardNames.get(pos), j, Replica.Type.NRT)); } } Collections.sort(positions); diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index 773c50c6614a..4d15e7730e41 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -1189,7 +1189,7 @@ public void reload(String name) { DocCollection docCollection = getZkController().getClusterState().getCollection(cd.getCollectionName()); Replica replica = docCollection.getReplica(cd.getCloudDescriptor().getCoreNodeName()); assert replica != null; - if (replica.getType() == Replica.Type.APPEND) { //TODO: needed here? + if (replica.getType() == Replica.Type.TLOG) { //TODO: needed here? getZkController().stopReplicationFromLeader(core.getName()); if (!cd.getCloudDescriptor().isLeader()) { getZkController().startReplicationFromLeader(newCore.getName(), true); @@ -1280,9 +1280,9 @@ public void unload(String name, boolean deleteIndexDir, boolean deleteDataDir, b if (zkSys.getZkController() != null) { // cancel recovery in cloud mode core.getSolrCoreState().cancelRecovery(); - if (core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.PASSIVE - || core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.APPEND) { - // Stop replication if this is part of a passive replica before closing the code + if (core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.PULL + || core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.TLOG) { + // Stop replication if this is part of a pull replica before closing the code zkSys.getZkController().stopReplicationFromLeader(name); } } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index e657b62b9e3f..4a5cb83f1c99 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -399,9 +399,9 @@ enum CollectionOperation implements CollectionOp { AUTO_ADD_REPLICAS, RULE, SNITCH, - PASSIVE_REPLICAS, - APPEND_REPLICAS, - REALTIME_REPLICAS); + PULL_REPLICAS, + TLOG_REPLICAS, + NRT_REPLICAS); if (props.get(STATE_FORMAT) == null) { props.put(STATE_FORMAT, "2"); diff --git a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java index f2d263975582..a2108be7001f 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java @@ -350,11 +350,11 @@ public void prepDistributed(ResponseBuilder rb) { String ourSlice = cloudDescriptor.getShardId(); String ourCollection = cloudDescriptor.getCollectionName(); // Some requests may only be fulfilled by replicas of type Replica.Type.REALTIME - boolean onlyRealtimeReplicas = Boolean.TRUE == req.getContext().get("distribOnlyRealtime"); + boolean onlyNrtReplicas = Boolean.TRUE == req.getContext().get("distribOnlyRealtime"); if (rb.slices.length == 1 && rb.slices[0] != null && ( rb.slices[0].equals(ourSlice) || rb.slices[0].equals(ourCollection + "_" + ourSlice) ) // handle the _ format && cloudDescriptor.getLastPublished() == Replica.State.ACTIVE - && (!onlyRealtimeReplicas || cloudDescriptor.getReplicaType() == Replica.Type.REALTIME)) { + && (!onlyNrtReplicas || cloudDescriptor.getReplicaType() == Replica.Type.NRT)) { boolean shortCircuit = params.getBool("shortCircuit", true); // currently just a debugging parameter to check distrib search on a single node String targetHandler = params.get(ShardParams.SHARDS_QT); @@ -397,11 +397,11 @@ public void prepDistributed(ResponseBuilder rb) { for (Replica replica : allSliceReplicas) { if (!clusterState.liveNodesContain(replica.getNodeName()) || replica.getState() != Replica.State.ACTIVE - || (onlyRealtimeReplicas && replica.getType() == Replica.Type.PASSIVE)) { + || (onlyNrtReplicas && replica.getType() == Replica.Type.PULL)) { continue; } - if (onlyRealtimeReplicas && replica.getType() == Replica.Type.APPEND) { + if (onlyNrtReplicas && replica.getType() == Replica.Type.TLOG) { if (shardLeader == null) { try { shardLeader = zkController.getZkStateReader().getLeaderRetry(cloudDescriptor.getCollectionName(), slice.getName()); diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java index 18e202ecfcce..676070586dd7 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java @@ -111,11 +111,11 @@ public void process(ResponseBuilder rb) throws IOException if (cloudDesc != null) { Replica.Type replicaType = cloudDesc.getReplicaType(); if (replicaType != null) { - if (replicaType == Replica.Type.PASSIVE) { + if (replicaType == Replica.Type.PULL) { throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT, "%s can't handle realtime get requests. Replicas of type %s do not support these type of requests", cloudDesc.getCoreNodeName(), - Replica.Type.PASSIVE)); + Replica.Type.PULL)); } // non-leader APPEND replicas should not respond to distrib /get requests, but internal requests are OK } diff --git a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java index 866510970094..e4811091c734 100644 --- a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java +++ b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java @@ -122,7 +122,7 @@ public DirectUpdateHandler2(SolrCore core) { indexWriterCloseWaitsForMerges = updateHandlerInfo.indexWriterCloseWaitsForMerges; ZkController zkController = core.getCoreContainer().getZkController(); - if (zkController != null && core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.APPEND) { + if (zkController != null && core.getCoreDescriptor().getCloudDescriptor().getReplicaType() == Replica.Type.TLOG) { commitWithinSoftCommit = false; commitTracker.setOpenSearcher(true); } diff --git a/solr/core/src/java/org/apache/solr/update/UpdateHandler.java b/solr/core/src/java/org/apache/solr/update/UpdateHandler.java index 5c6e33b82dce..42abaf8f9e04 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateHandler.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateHandler.java @@ -124,7 +124,7 @@ public UpdateHandler(SolrCore core, UpdateLog updateLog) { PluginInfo ulogPluginInfo = core.getSolrConfig().getPluginInfo(UpdateLog.class.getName()); - // If this is a replica of type passive, don't create the update log + // If this is a replica of type PULL, don't create the update log boolean skipUpdateLog = core.getCoreDescriptor().getCloudDescriptor() != null && !core.getCoreDescriptor().getCloudDescriptor().requiresTransactionLog(); if (updateLog == null && ulogPluginInfo != null && !skipUpdateLog) { String dataDir = (String)ulogPluginInfo.initArgs.get("dir"); diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java b/solr/core/src/java/org/apache/solr/update/UpdateLog.java index ddacb1971b43..87b93f4a5177 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java @@ -1092,7 +1092,7 @@ public Future recoverFromLog() { /** * Replay current tlog, so all updates will be written to index. - * This is must do task for a append replica become a new leader. + * This is must do task for a tlog replica become a new leader. * @return future of this task */ public Future recoverFromCurrentLog() { diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java index 52cb034f5709..e67f98219fc5 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java @@ -328,7 +328,7 @@ public DistributedUpdateProcessor(SolrQueryRequest req, replicaType = cloudDesc.getReplicaType(); } else { collection = null; - replicaType = Replica.Type.REALTIME; + replicaType = Replica.Type.NRT; } boolean shouldClone = false; @@ -679,7 +679,7 @@ private List setupRequestForDBQ() { forwardToLeader = false; List replicaProps = zkController.getZkStateReader() - .getReplicaProps(collection, shardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.REALTIME, Replica.Type.APPEND)); + .getReplicaProps(collection, shardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG)); if (replicaProps != null) { nodes = new ArrayList<>(replicaProps.size()); for (ZkCoreNodeProps props : replicaProps) { @@ -1189,7 +1189,7 @@ protected boolean versionAdd(AddUpdateCommand cmd) throws IOException { checkDeleteByQueries = true; } } - if (replicaType == Replica.Type.APPEND && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (replicaType == Replica.Type.TLOG && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { cmd.setFlags(cmd.getFlags() | UpdateCommand.IGNORE_INDEXWRITER); } } @@ -1611,7 +1611,7 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { collection, myShardId); // DBQ forwarded to Realtime and Append List replicaProps = zkController.getZkStateReader() - .getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.REALTIME, Replica.Type.APPEND)); + .getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG)); if (replicaProps != null) { final List myReplicas = new ArrayList<>(replicaProps.size()); for (ZkCoreNodeProps replicaProp : replicaProps) { @@ -1699,7 +1699,7 @@ protected void versionDeleteByQuery(DeleteUpdateCommand cmd) throws IOException return; } - if (replicaType == Replica.Type.APPEND && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (replicaType == Replica.Type.TLOG && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { // Append replica not leader, don't write the DBQ to IW cmd.setFlags(cmd.getFlags() | UpdateCommand.IGNORE_INDEXWRITER); } @@ -1857,7 +1857,7 @@ protected boolean versionDelete(DeleteUpdateCommand cmd) throws IOException { } } - if (replicaType == Replica.Type.APPEND && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { + if (replicaType == Replica.Type.TLOG && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { cmd.setFlags(cmd.getFlags() | UpdateCommand.IGNORE_INDEXWRITER); } } @@ -1884,11 +1884,11 @@ public void processCommit(CommitUpdateCommand cmd) throws IOException { zkCheck(); nodes = getCollectionUrls(req, req.getCore().getCoreDescriptor() - .getCloudDescriptor().getCollectionName(), EnumSet.of(Replica.Type.APPEND,Replica.Type.REALTIME)); + .getCloudDescriptor().getCollectionName(), EnumSet.of(Replica.Type.TLOG,Replica.Type.NRT)); if (nodes == null) { - // This could happen if there are only passive replicas + // This could happen if there are only pull replicas throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, - "Unable to distribute commit operation. No replicas available of types " + Replica.Type.APPEND + " or " + Replica.Type.REALTIME); + "Unable to distribute commit operation. No replicas available of types " + Replica.Type.TLOG + " or " + Replica.Type.NRT); } if (isLeader && nodes.size() == 1) { singleLeader = true; @@ -1896,7 +1896,7 @@ public void processCommit(CommitUpdateCommand cmd) throws IOException { } if (!zkEnabled || req.getParams().getBool(COMMIT_END_POINT, false) || singleLeader) { - if (replicaType == Replica.Type.APPEND) { // REALTIME will always commit + if (replicaType == Replica.Type.TLOG) { // REALTIME will always commit try { Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry( collection, cloudDesc.getShardId()); diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java index f3eb4841b390..7b3015194612 100644 --- a/solr/core/src/java/org/apache/solr/util/TestInjection.java +++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java @@ -388,7 +388,7 @@ public static boolean waitForInSyncWithLeader(SolrCore core, ZkController zkCont String localVersion = searcher.get().getIndexReader().getIndexCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); if (localVersion == null && leaderVersion == 0 && !core.getUpdateHandler().getUpdateLog().hasUncommittedChanges()) return true; if (localVersion != null && Long.parseLong(localVersion) == leaderVersion && (leaderVersion >= t || i >= 6)) { - log.info("Waiting time for append replica to be in sync with leader: {}", System.currentTimeMillis()-currentTime); + log.info("Waiting time for tlog replica to be in sync with leader: {}", System.currentTimeMillis()-currentTime); return true; } else { log.debug("Append replica not in sync with leader yet. Attempt: {}", i); diff --git a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java index d651e68620b1..e488c53e53f3 100644 --- a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java @@ -82,12 +82,12 @@ public void test() throws Exception { boolean isImplicit = random().nextBoolean(); boolean doSplitShardOperation = !isImplicit && random().nextBoolean(); int replFactor = TestUtil.nextInt(random(), 1, 2); - int numAppendReplicas = TestUtil.nextInt(random(), 0, 1); - int numPassiveReplicas = TestUtil.nextInt(random(), 0, 1); + int numTlogReplicas = TestUtil.nextInt(random(), 0, 1); + int numPullReplicas = TestUtil.nextInt(random(), 0, 1); CollectionAdminRequest.Create create = - CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor, numAppendReplicas, numPassiveReplicas); - if (NUM_SHARDS * (replFactor + numAppendReplicas + numPassiveReplicas) > cluster.getJettySolrRunners().size() || random().nextBoolean()) { - create.setMaxShardsPerNode((int)Math.ceil(NUM_SHARDS * (replFactor + numAppendReplicas + numPassiveReplicas) / cluster.getJettySolrRunners().size()));//just to assert it survives the restoration + CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor, numTlogReplicas, numPullReplicas); + if (NUM_SHARDS * (replFactor + numTlogReplicas + numPullReplicas) > cluster.getJettySolrRunners().size() || random().nextBoolean()) { + create.setMaxShardsPerNode((int)Math.ceil(NUM_SHARDS * (replFactor + numTlogReplicas + numPullReplicas) / cluster.getJettySolrRunners().size()));//just to assert it survives the restoration if (doSplitShardOperation) { create.setMaxShardsPerNode(create.getMaxShardsPerNode() * 2); } diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java index 51f9fe9301fd..c8e92fc14f47 100644 --- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java +++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java @@ -62,7 +62,7 @@ public BasicDistributedZk2Test() { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java index 162164b01424..18caa5858c2e 100644 --- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java @@ -119,7 +119,7 @@ public BasicDistributedZkTest() { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java index 0130dc822dc0..2e3152015614 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeTest.java @@ -98,7 +98,7 @@ public ChaosMonkeyNothingIsSafeTest() { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPullReplicasTest.java similarity index 90% rename from solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java rename to solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPullReplicasTest.java index 94d6e4557ad2..8c4c781eb661 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeyNothingIsSafeWithPullReplicasTest.java @@ -50,20 +50,20 @@ @SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") @ThreadLeakLingering(linger = 60000) @SuppressObjectReleaseTracker(bugUrl="Testing purposes") -public class ChaosMonkeyNothingIsSafeWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { +public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDistribZkTestBase { private static final int FAIL_TOLERANCE = 100; private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final Integer RUN_LENGTH = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.runlength", "-1")); - private final boolean useAppendReplicas = random().nextBoolean(); + private final boolean useTlogReplicas = random().nextBoolean(); - private final int numPassiveReplicas; - private final int numRealtimeOrAppendReplicas; + private final int numPullReplicas; + private final int numRealtimeOrTlogReplicas; - protected int getPassiveReplicaCount() { - return numPassiveReplicas; + protected int getPullReplicaCount() { + return numPullReplicas; } @BeforeClass @@ -102,16 +102,16 @@ public void distribSetUp() throws Exception { useFactory("solr.StandardDirectoryFactory"); } - public ChaosMonkeyNothingIsSafeWithPassiveReplicasTest() { + public ChaosMonkeyNothingIsSafeWithPullReplicasTest() { super(); - numPassiveReplicas = random().nextInt(TEST_NIGHTLY ? 2 : 1) + 1; - numRealtimeOrAppendReplicas = random().nextInt(TEST_NIGHTLY ? 4 : 3) + 1; + numPullReplicas = random().nextInt(TEST_NIGHTLY ? 2 : 1) + 1; + numRealtimeOrTlogReplicas = random().nextInt(TEST_NIGHTLY ? 4 : 3) + 1; sliceCount = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.slicecount", "-1")); if (sliceCount == -1) { sliceCount = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1; } - int numNodes = sliceCount * (numRealtimeOrAppendReplicas + numPassiveReplicas); + int numNodes = sliceCount * (numRealtimeOrTlogReplicas + numPullReplicas); fixShardCount(numNodes); log.info("Starting ChaosMonkey test with {} shards and {} nodes", sliceCount, numNodes); @@ -121,8 +121,8 @@ public ChaosMonkeyNothingIsSafeWithPassiveReplicasTest() { } @Override - protected boolean useAppendReplicas() { - return useAppendReplicas; + protected boolean useTlogReplicas() { + return useTlogReplicas; } @Test @@ -132,10 +132,10 @@ public void test() throws Exception { assertEquals(this.sliceCount, docCollection.getSlices().size()); Slice s = docCollection.getSlice("shard1"); assertNotNull(s); - assertEquals("Unexpected number of replicas. Collection: " + docCollection, numRealtimeOrAppendReplicas + numPassiveReplicas, s.getReplicas().size()); - assertEquals("Unexpected number of passive replicas. Collection: " + docCollection, numPassiveReplicas, s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); - assertEquals(useAppendReplicas()?0:numRealtimeOrAppendReplicas, s.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); - assertEquals(useAppendReplicas()?numRealtimeOrAppendReplicas:0, s.getReplicas(EnumSet.of(Replica.Type.APPEND)).size()); + assertEquals("Unexpected number of replicas. Collection: " + docCollection, numRealtimeOrTlogReplicas + numPullReplicas, s.getReplicas().size()); + assertEquals("Unexpected number of pull replicas. Collection: " + docCollection, numPullReplicas, s.getReplicas(EnumSet.of(Replica.Type.PULL)).size()); + assertEquals(useTlogReplicas()?0:numRealtimeOrTlogReplicas, s.getReplicas(EnumSet.of(Replica.Type.NRT)).size()); + assertEquals(useTlogReplicas()?numRealtimeOrTlogReplicas:0, s.getReplicas(EnumSet.of(Replica.Type.TLOG)).size()); boolean testSuccessful = false; try { @@ -287,7 +287,7 @@ public void test() throws Exception { } List numShardsNumReplicas = new ArrayList<>(2); numShardsNumReplicas.add(1); - numShardsNumReplicas.add(1 + getPassiveReplicaCount()); + numShardsNumReplicas.add(1 + getPullReplicaCount()); checkForCollection("testcollection", numShardsNumReplicas, null); testSuccessful = true; diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPullReplicasTest.java similarity index 85% rename from solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java rename to solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPullReplicasTest.java index 8639fba0771b..62d77a65580a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPassiveReplicasTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkeySafeLeaderWithPullReplicasTest.java @@ -41,24 +41,24 @@ @Slow @SuppressObjectReleaseTracker(bugUrl="Testing purposes") -public class ChaosMonkeySafeLeaderWithPassiveReplicasTest extends AbstractFullDistribZkTestBase { +public class ChaosMonkeySafeLeaderWithPullReplicasTest extends AbstractFullDistribZkTestBase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final Integer RUN_LENGTH = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.runlength", "-1")); - private final boolean useAppendReplicas = random().nextBoolean(); + private final boolean useTlogReplicas = random().nextBoolean(); - private final int numPassiveReplicas; - private final int numRealtimeOrAppendReplicas; + private final int numPullReplicas; + private final int numRealtimeOrTlogReplicas; @Override - protected int getPassiveReplicaCount() { - return numPassiveReplicas; + protected int getPullReplicaCount() { + return numPullReplicas; } @Override - protected boolean useAppendReplicas() { - return useAppendReplicas; + protected boolean useTlogReplicas() { + return useTlogReplicas; } @BeforeClass @@ -93,16 +93,16 @@ public void distribSetUp() throws Exception { super.distribSetUp(); } - public ChaosMonkeySafeLeaderWithPassiveReplicasTest() { + public ChaosMonkeySafeLeaderWithPullReplicasTest() { super(); - numPassiveReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; - numRealtimeOrAppendReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; + numPullReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; + numRealtimeOrTlogReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;; sliceCount = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.slicecount", "-1")); if (sliceCount == -1) { sliceCount = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1; } - int numNodes = sliceCount * (numRealtimeOrAppendReplicas + numPassiveReplicas); + int numNodes = sliceCount * (numRealtimeOrTlogReplicas + numPullReplicas); fixShardCount(numNodes); log.info("Starting ChaosMonkey test with {} shards and {} nodes", sliceCount, numNodes); } @@ -113,10 +113,10 @@ public void test() throws Exception { assertEquals(this.sliceCount, docCollection.getSlices().size()); Slice s = docCollection.getSlice("shard1"); assertNotNull(s); - assertEquals("Unexpected number of replicas. Collection: " + docCollection, numRealtimeOrAppendReplicas + numPassiveReplicas, s.getReplicas().size()); - assertEquals("Unexpected number of passive replicas. Collection: " + docCollection, numPassiveReplicas, s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); - assertEquals(useAppendReplicas()?0:numRealtimeOrAppendReplicas, s.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); - assertEquals(useAppendReplicas()?numRealtimeOrAppendReplicas:0, s.getReplicas(EnumSet.of(Replica.Type.APPEND)).size()); + assertEquals("Unexpected number of replicas. Collection: " + docCollection, numRealtimeOrTlogReplicas + numPullReplicas, s.getReplicas().size()); + assertEquals("Unexpected number of pull replicas. Collection: " + docCollection, numPullReplicas, s.getReplicas(EnumSet.of(Replica.Type.PULL)).size()); + assertEquals(useTlogReplicas()?0:numRealtimeOrTlogReplicas, s.getReplicas(EnumSet.of(Replica.Type.NRT)).size()); + assertEquals(useTlogReplicas()?numRealtimeOrTlogReplicas:0, s.getReplicas(EnumSet.of(Replica.Type.TLOG)).size()); handle.clear(); handle.put("timestamp", SKIPVAL); @@ -220,7 +220,7 @@ public void test() throws Exception { } List numShardsNumReplicas = new ArrayList<>(2); numShardsNumReplicas.add(1); - numShardsNumReplicas.add(1 + getPassiveReplicaCount()); + numShardsNumReplicas.add(1 + getPullReplicaCount()); checkForCollection("testcollection",numShardsNumReplicas, null); } diff --git a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java index e75a8547fdc5..ea8598bd0b79 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPIDistributedZkTest.java @@ -286,7 +286,7 @@ public void testCreateShouldFailOnExistingCore() throws Exception { // first we make a core with the core name the collections api // will try and use - this will cause our mock fail Create createCmd = new Create(); - createCmd.setCoreName(Assign.buildCoreName("halfcollection", "shard1", Replica.Type.REALTIME, 1)); + createCmd.setCoreName(Assign.buildCoreName("halfcollection", "shard1", Replica.Type.NRT, 1)); createCmd.setCollection("halfcollectionblocker"); String dataDir = createTempDir().toFile().getAbsolutePath(); createCmd.setDataDir(dataDir); @@ -298,7 +298,7 @@ public void testCreateShouldFailOnExistingCore() throws Exception { } createCmd = new Create(); - createCmd.setCoreName(Assign.buildCoreName("halfcollection", "shard1", Replica.Type.REALTIME, 1)); + createCmd.setCoreName(Assign.buildCoreName("halfcollection", "shard1", Replica.Type.NRT, 1)); createCmd.setCollection("halfcollectionblocker2"); dataDir = createTempDir().toFile().getAbsolutePath(); createCmd.setDataDir(dataDir); diff --git a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java index 2a2da781abe0..643660bc2937 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java @@ -67,7 +67,7 @@ public void testCreateAndDeleteCollection() throws Exception { Map> coresStatus = response.getCollectionCoresStatus(); assertEquals(4, coresStatus.size()); for (int i=0; i<4; i++) { - NamedList status = coresStatus.get(Assign.buildCoreName(collectionName, "shard" + (i/2+1), Replica.Type.REALTIME, (i%2+1))); + NamedList status = coresStatus.get(Assign.buildCoreName(collectionName, "shard" + (i/2+1), Replica.Type.NRT, (i%2+1))); assertEquals(0, (int)status.get("status")); assertTrue(status.get("QTime") > 0); } @@ -136,9 +136,9 @@ public void testCreateAndDeleteShard() throws IOException, SolrServerException { assertTrue(response.isSuccess()); coresStatus = response.getCollectionCoresStatus(); assertEquals(3, coresStatus.size()); - assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.REALTIME, 1)).get("status")); - assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.APPEND, 1)).get("status")); - assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.PASSIVE, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.NRT, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.TLOG, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shardC", Replica.Type.PULL, 1)).get("status")); response = CollectionAdminRequest.deleteShard(collectionName, "shardC").process(cluster.getSolrClient()); @@ -176,8 +176,8 @@ public void testSplitShard() throws Exception { assertEquals(0, response.getStatus()); assertTrue(response.isSuccess()); Map> coresStatus = response.getCollectionCoresStatus(); - assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shard1_0" , Replica.Type.REALTIME, 1)).get("status")); - assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shard1_1" , Replica.Type.REALTIME, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shard1_0" , Replica.Type.NRT, 1)).get("status")); + assertEquals(0, (int) coresStatus.get(Assign.buildCoreName(collectionName, "shard1_1" , Replica.Type.NRT, 1)).get("status")); waitForState("Expected all shards to be active and parent shard to be removed", collectionName, (n, c) -> { if (c.getSlice("shard1").getState() == Slice.State.ACTIVE) diff --git a/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java b/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java index db9ecb4769d7..8f35c888a3c6 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ForceLeaderTest.java @@ -58,7 +58,7 @@ public class ForceLeaderTest extends HttpPartitionTest { private final boolean onlyLeaderIndexes = random().nextBoolean(); @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } diff --git a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java index 840679148871..2cc1c3053fa1 100644 --- a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java @@ -85,7 +85,7 @@ public HttpPartitionTest() { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java index 199091952d0e..f3965acfccdf 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java @@ -46,7 +46,7 @@ public LeaderInitiatedRecoveryOnCommitTest() { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java index 340adbbbcb26..91da2c17d09f 100644 --- a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java @@ -347,7 +347,7 @@ protected void verifySubmitCaptures( assertEquals(numberOfSlices * numberOfReplica, coreNames.size()); for (int i = 1; i <= numberOfSlices; i++) { for (int j = 1; j <= numberOfReplica; j++) { - String coreName = Assign.buildCoreName(COLLECTION_NAME, "shard" + i, Replica.Type.REALTIME, j); + String coreName = Assign.buildCoreName(COLLECTION_NAME, "shard" + i, Replica.Type.NRT, j); assertTrue("Shard " + coreName + " was not created", coreNames.contains(coreName)); diff --git a/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java b/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java index eabd9b03c493..8290e122f422 100644 --- a/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/RecoveryAfterSoftCommitTest.java @@ -40,7 +40,7 @@ public RecoveryAfterSoftCommitTest() { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } diff --git a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java index e00ea0d639ae..73a0bf795edf 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java @@ -81,7 +81,7 @@ public ShardSplitTest() { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return false; } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java index 965c169718f9..c7fc0e8fab61 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java @@ -110,7 +110,7 @@ public void leaderRecoverFromLogOnStartupTest() throws Exception { assertEquals(4, resp.getResults().getNumFound()); // Make sure all nodes is recover from tlog if (onlyLeaderIndexes) { - // Leader election can be kicked off, so 2 append replicas will replay its tlog before becoming new leader + // Leader election can be kicked off, so 2 tlog replicas will replay its tlog before becoming new leader assertTrue( countReplayLog.get() >=2); } else { assertEquals(4, countReplayLog.get()); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java b/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java index 68509b86cad3..74ad7bd62b21 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCollectionAPI.java @@ -61,7 +61,7 @@ public TestCollectionAPI() { public void test() throws Exception { try (CloudSolrClient client = createCloudClient(null)) { CollectionAdminRequest.Create req; - if (useAppendReplicas()) { + if (useTlogReplicas()) { req = CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf1",2, 0, 1, 1); } else { req = CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf1",2, 1, 0, 1); @@ -177,7 +177,7 @@ private void clusterStatusWithCollection() throws IOException, SolrServerExcepti Map collection = (Map) collections.get(COLLECTION_NAME); assertNotNull(collection); assertEquals("conf1", collection.get("configName")); -// assertEquals("1", collection.get("realtimeReplicas")); +// assertEquals("1", collection.get("nrtReplicas")); } } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java similarity index 83% rename from solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java rename to solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java index c9cacbbd751b..fa0578b28ce8 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java @@ -60,7 +60,7 @@ import com.carrotsearch.randomizedtesting.annotations.Repeat; @Slow -public class TestPassiveReplica extends SolrCloudTestCase { +public class TestPullReplica extends SolrCloudTestCase { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -113,28 +113,6 @@ public void tearDown() throws Exception { super.tearDown(); } - // Just to compare test time, nocommit - @Ignore - public void testCreateDelete2() throws Exception { - try { - CollectionAdminRequest.createCollection(collectionName, "conf", 1, 8, 0, 0).process(cluster.getSolrClient()); - DocCollection docCollection = getCollectionState(collectionName); - assertNotNull(docCollection); -// assertEquals("Expecting 4 relpicas per shard", -// 8, docCollection.getReplicas().size()); -// assertEquals("Expecting 6 passive replicas, 3 per shard", -// 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); -// assertEquals("Expecting 2 writer replicas, one per shard", -// 2, docCollection.getReplicas(EnumSet.of(Replica.Type.WRITER)).size()); -// for (Slice s:docCollection.getSlices()) { -// // read-only replicas can never become leaders -// assertFalse(s.getLeader().isReadOnly()); -// } - } finally { - zkClient().printLayoutToStdOut(); - } - } - @Repeat(iterations=2) // 2 times to make sure cleanup is complete and we can create the same collection public void testCreateDelete() throws Exception { try { @@ -147,13 +125,13 @@ public void testCreateDelete() throws Exception { assertNotNull(docCollection); assertEquals("Expecting 4 relpicas per shard", 8, docCollection.getReplicas().size()); - assertEquals("Expecting 6 passive replicas, 3 per shard", - 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); + assertEquals("Expecting 6 pull replicas, 3 per shard", + 6, docCollection.getReplicas(EnumSet.of(Replica.Type.PULL)).size()); assertEquals("Expecting 2 writer replicas, one per shard", - 2, docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); + 2, docCollection.getReplicas(EnumSet.of(Replica.Type.NRT)).size()); for (Slice s:docCollection.getSlices()) { // read-only replicas can never become leaders - assertFalse(s.getLeader().getType() == Replica.Type.PASSIVE); + assertFalse(s.getLeader().getType() == Replica.Type.PULL); List shardElectionNodes = cluster.getZkClient().getChildren(ZkStateReader.getShardLeadersElectPath(collectionName, s.getName()), null, true); assertEquals("Unexpected election nodes for Shard: " + s.getName() + ": " + Arrays.toString(shardElectionNodes.toArray()), 1, shardElectionNodes.size()); @@ -175,8 +153,8 @@ public void testCreateDelete() throws Exception { } /** - * Asserts that Update logs exist for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#REALTIME}, but not - * for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} + * Asserts that Update logs exist for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#NRT}, but not + * for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PULL} */ private void assertUlogPresence(DocCollection collection) { for (Slice s:collection.getSlices()) { @@ -185,7 +163,7 @@ private void assertUlogPresence(DocCollection collection) { try { core = cluster.getReplicaJetty(r).getCoreContainer().getCore(r.getCoreName()); assertNotNull(core); - assertEquals("Update log should not exist for replicas of type Passive", r.getType() == Replica.Type.REALTIME, + assertEquals("Update log should not exist for replicas of type Passive", r.getType() == Replica.Type.NRT, new java.io.File(core.getUlogDir()).exists()); } finally { core.close(); @@ -214,7 +192,7 @@ public void testAddDocs() throws Exception { } TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS); - for (Replica r:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE))) { + for (Replica r:s.getReplicas(EnumSet.of(Replica.Type.PULL))) { //TODO: assert replication < REPLICATION_TIMEOUT_SECS try (HttpSolrClient readOnlyReplicaClient = getHttpSolrClient(r.getCoreUrl())) { while (true) { @@ -241,7 +219,7 @@ public void testAddDocs() throws Exception { assertUlogPresence(docCollection); } - public void testAddRemovePassiveReplica() throws Exception { + public void testAddRemovePullReplica() throws Exception { CollectionAdminRequest.createCollection(collectionName, "conf", 2, 1, 0, 0) .setMaxShardsPerNode(100) .process(cluster.getSolrClient()); @@ -250,18 +228,18 @@ public void testAddRemovePassiveReplica() throws Exception { DocCollection docCollection = assertNumberOfReplicas(2, 0, 0, false, true); assertEquals(2, docCollection.getSlices().size()); - CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.PASSIVE).process(cluster.getSolrClient()); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.PULL).process(cluster.getSolrClient()); docCollection = assertNumberOfReplicas(2, 0, 1, true, false); - CollectionAdminRequest.addReplicaToShard(collectionName, "shard2", Replica.Type.PASSIVE).process(cluster.getSolrClient()); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard2", Replica.Type.PULL).process(cluster.getSolrClient()); docCollection = assertNumberOfReplicas(2, 0, 2, true, false); waitForState("Expecting collection to have 2 shards and 2 replica each", collectionName, clusterShape(2, 2)); - //Delete passive replica from shard1 + //Delete pull replica from shard1 CollectionAdminRequest.deleteReplica( collectionName, "shard1", - docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getName()) + docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getName()) .process(cluster.getSolrClient()); assertNumberOfReplicas(2, 0, 1, true, true); } @@ -275,8 +253,8 @@ public void testKillLeader() throws Exception { } @Ignore("Ignore until I figure out a way to reliably record state transitions") - public void testPassiveReplicaStates() throws Exception { - // Validate that passive replicas go through the correct states when starting, stopping, reconnecting + public void testPullReplicaStates() throws Exception { + // Validate that pull replicas go through the correct states when starting, stopping, reconnecting CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, 0) .setMaxShardsPerNode(100) .process(cluster.getSolrClient()); @@ -294,7 +272,7 @@ public void testPassiveReplicaStates() throws Exception { LOG.info("CollectionStateWatcher saw state: {}", r.getState()); return r.getState() == Replica.State.ACTIVE; }); - CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.PASSIVE).process(cluster.getSolrClient()); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.PULL).process(cluster.getSolrClient()); waitForState("Replica not added", collectionName, activeReplicaCount(1, 0, 1)); zkClient().printLayoutToStdOut(); LOG.info("Saw states: " + Arrays.toString(statesSeen.toArray())); @@ -366,7 +344,7 @@ private void doTestNoLeader(boolean removeReplica) throws Exception { assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); } - waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PULL))); // Delete leader replica from shard1 ignoreException("No registered leader was found"); //These are expected @@ -391,29 +369,29 @@ private void doTestNoLeader(boolean removeReplica) throws Exception { Replica leader = docCollection.getSlice("shard1").getLeader(); assertTrue(leader == null || !leader.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())); - // Passive replica on the other hand should be active - Replica passiveReplica = docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0); - assertTrue(passiveReplica.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())); + // Pull replica on the other hand should be active + Replica pullReplica = docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PULL)).get(0); + assertTrue(pullReplica.isActive(cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes())); - // add document, this should fail since there is no leader. Passive replica should not accept the update + // add document, this should fail since there is no leader. Pull replica should not accept the update expectThrows(SolrException.class, () -> cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "zoo")) ); - // Also fails if I send the update to the passive replica explicitly - try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { + // Also fails if I send the update to the pull replica explicitly + try (HttpSolrClient pullReplicaClient = getHttpSolrClient(docCollection.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) { expectThrows(SolrException.class, () -> cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "2", "foo", "zoo")) ); } // Queries should still work - waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); - // Add realtime replica back. Since there is no rt now, new rt will have no docs. There will be data loss, since the it will become the leader - // and passive replicas will replicate from it. Maybe we want to change this. Replicate from passive replicas is not a good idea, since they + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PULL))); + // Add nrt replica back. Since there is no rt now, new rt will have no docs. There will be data loss, since the it will become the leader + // and pull replicas will replicate from it. Maybe we want to change this. Replicate from pull replicas is not a good idea, since they // are by definition out of date. if (removeReplica) { - CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.REALTIME).process(cluster.getSolrClient()); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.NRT).process(cluster.getSolrClient()); } else { ChaosMonkey.start(leaderJetty); } @@ -428,8 +406,8 @@ private void doTestNoLeader(boolean removeReplica) throws Exception { //nocommit: If jetty is restarted, the replication is not forced, and replica doesn't replicate from leader until new docs are added. Is this the correct behavior? Why should these two cases be different? if (removeReplica) { - // Passive replicas will replicate the empty index if a new replica was added and becomes leader - waitForNumDocsInAllReplicas(0, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); + // Pull replicas will replicate the empty index if a new replica was added and becomes leader + waitForNumDocsInAllReplicas(0, docCollection.getReplicas(EnumSet.of(Replica.Type.PULL))); } // add docs agin @@ -439,11 +417,11 @@ private void doTestNoLeader(boolean removeReplica) throws Exception { leaderClient.commit(); assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); } - waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)), "id:2"); - waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE))); + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PULL)), "id:2"); + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.PULL))); } - public void testKillPassiveReplica() throws Exception { + public void testKillPullReplica() throws Exception { CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, 1) .setMaxShardsPerNode(100) .process(cluster.getSolrClient()); @@ -457,8 +435,8 @@ public void testKillPassiveReplica() throws Exception { cluster.getSolrClient().commit(collectionName); waitForNumDocsInAllActiveReplicas(1); - JettySolrRunner passiveReplicaJetty = cluster.getReplicaJetty(docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0)); - ChaosMonkey.kill(passiveReplicaJetty); + JettySolrRunner pullReplicaJetty = cluster.getReplicaJetty(docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PULL)).get(0)); + ChaosMonkey.kill(pullReplicaJetty); waitForState("Replica not removed", collectionName, activeReplicaCount(1, 0, 0)); // Also wait for the replica to be placed in state="down" waitForState("Didn't update state", collectionName, clusterStateReflectsActiveAndDownReplicas()); @@ -467,7 +445,7 @@ public void testKillPassiveReplica() throws Exception { cluster.getSolrClient().commit(collectionName); waitForNumDocsInAllActiveReplicas(2); - ChaosMonkey.start(passiveReplicaJetty); + ChaosMonkey.start(pullReplicaJetty); waitForState("Replica not added", collectionName, activeReplicaCount(1, 0, 1)); waitForNumDocsInAllActiveReplicas(2); } @@ -523,18 +501,18 @@ private void waitForDeletion(String collection) throws InterruptedException, Kee } } - private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int numPassive, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { + private DocCollection assertNumberOfReplicas(int numNrtReplicas, int numTlogReplicas, int numPullReplicas, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { if (updateCollection) { cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collectionName); } DocCollection docCollection = getCollectionState(collectionName); assertNotNull(docCollection); - assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, - docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, - docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - assertEquals("Unexpected number of active replicas: " + docCollection, numActive, - docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of writer replicas: " + docCollection, numNrtReplicas, + docCollection.getReplicas(EnumSet.of(Replica.Type.NRT)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of pull replicas: " + docCollection, numPullReplicas, + docCollection.getReplicas(EnumSet.of(Replica.Type.PULL)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of active replicas: " + docCollection, numTlogReplicas, + docCollection.getReplicas(EnumSet.of(Replica.Type.TLOG)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); return docCollection; } @@ -559,30 +537,30 @@ private CollectionStatePredicate clusterStateReflectsActiveAndDownReplicas() { } - private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive, int numPassive) { + private CollectionStatePredicate activeReplicaCount(int numNrtReplicas, int numTlogReplicas, int numPullReplicas) { return (liveNodes, collectionState) -> { - int writersFound = 0, activesFound = 0, passivesFound = 0; + int nrtFound = 0, tlogFound = 0, pullFound = 0; if (collectionState == null) return false; for (Slice slice : collectionState) { for (Replica replica : slice) { if (replica.isActive(liveNodes)) switch (replica.getType()) { - case APPEND: - activesFound++; + case TLOG: + tlogFound++; break; - case PASSIVE: - passivesFound++; + case PULL: + pullFound++; break; - case REALTIME: - writersFound++; + case NRT: + nrtFound++; break; default: throw new AssertionError("Unexpected replica type"); } } } - return numWriter == writersFound && numActive == activesFound && numPassive == passivesFound; + return numNrtReplicas == nrtFound && numTlogReplicas == tlogFound && numPullReplicas == pullFound; }; } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaErrorHandling.java similarity index 84% rename from solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java rename to solr/core/src/test/org/apache/solr/cloud/TestPullReplicaErrorHandling.java index f331ef6cd1fb..8d18bec40634 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPassiveReplicaErrorHandling.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaErrorHandling.java @@ -52,7 +52,7 @@ import org.slf4j.LoggerFactory; @SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776") -public class TestPassiveReplicaErrorHandling extends SolrCloudTestCase { +public class TestPullReplicaErrorHandling extends SolrCloudTestCase { private final static int REPLICATION_TIMEOUT_SECS = 10; @@ -132,7 +132,7 @@ public void tearDown() throws Exception { } // @Repeat(iterations=10) - public void testCantConnectToPassiveReplica() throws Exception { + public void testCantConnectToPullReplica() throws Exception { int numShards = 2; CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1, 0, 1) .setMaxShardsPerNode(1) @@ -140,7 +140,7 @@ public void testCantConnectToPassiveReplica() throws Exception { addDocs(10); DocCollection docCollection = assertNumberOfReplicas(numShards, 0, numShards, false, true); Slice s = docCollection.getSlices().iterator().next(); - SocketProxy proxy = getProxyForReplica(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0)); + SocketProxy proxy = getProxyForReplica(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0)); try { proxy.close(); for (int i = 1; i <= 10; i ++) { @@ -149,9 +149,9 @@ public void testCantConnectToPassiveReplica() throws Exception { assertNumDocs(10 + i, leaderClient); } } - try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { - passiveReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound(); - fail("Shouldn't be able to query the passive replica"); + try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) { + pullReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound(); + fail("Shouldn't be able to query the pull replica"); } catch (SolrServerException e) { //expected } @@ -168,8 +168,8 @@ public void testCantConnectToPassiveReplica() throws Exception { proxy.reopen(); } - try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { - assertNumDocs(20, passiveReplicaClient); + try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) { + assertNumDocs(20, pullReplicaClient); } } @@ -184,13 +184,13 @@ public void testCantConnectToLeader() throws Exception { SocketProxy proxy = getProxyForReplica(s.getLeader()); try { // wait for replication - try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { - assertNumDocs(10, passiveReplicaClient); + try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) { + assertNumDocs(10, pullReplicaClient); } proxy.close(); expectThrows(SolrException.class, ()->addDocs(1)); - try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { - assertNumDocs(10, passiveReplicaClient); + try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) { + assertNumDocs(10, pullReplicaClient); } assertNumDocs(10, cluster.getSolrClient()); } finally { @@ -198,7 +198,7 @@ public void testCantConnectToLeader() throws Exception { } } - public void testPassiveReplicaDisconnectsFromZooKeeper() throws Exception { + public void testPullReplicaDisconnectsFromZooKeeper() throws Exception { int numShards = 1; CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1, 0, 1) .setMaxShardsPerNode(1) @@ -206,18 +206,18 @@ public void testPassiveReplicaDisconnectsFromZooKeeper() throws Exception { addDocs(10); DocCollection docCollection = assertNumberOfReplicas(numShards, 0, numShards, false, true); Slice s = docCollection.getSlices().iterator().next(); - try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { - assertNumDocs(10, passiveReplicaClient); + try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) { + assertNumDocs(10, pullReplicaClient); } addDocs(20); - JettySolrRunner jetty = getJettyForReplica(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0)); + JettySolrRunner jetty = getJettyForReplica(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0)); cluster.expireZkSession(jetty); addDocs(30); waitForState("Expecting node to be disconnected", collectionName, activeReplicaCount(1, 0, 0)); addDocs(40); waitForState("Expecting node to be disconnected", collectionName, activeReplicaCount(1, 0, 1)); - try (HttpSolrClient passiveReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).get(0).getCoreUrl())) { - assertNumDocs(40, passiveReplicaClient); + try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) { + assertNumDocs(40, pullReplicaClient); } } @@ -251,11 +251,11 @@ private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int n DocCollection docCollection = getCollectionState(collectionName); assertNotNull(docCollection); assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, - docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, - docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + docCollection.getReplicas(EnumSet.of(Replica.Type.NRT)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of pull replicas: " + docCollection, numPassive, + docCollection.getReplicas(EnumSet.of(Replica.Type.PULL)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); assertEquals("Unexpected number of active replicas: " + docCollection, numActive, - docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + docCollection.getReplicas(EnumSet.of(Replica.Type.TLOG)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); return docCollection; } @@ -309,13 +309,13 @@ private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive for (Replica replica : slice) { if (replica.isActive(liveNodes)) switch (replica.getType()) { - case APPEND: + case TLOG: activesFound++; break; - case PASSIVE: + case PULL: passivesFound++; break; - case REALTIME: + case NRT: writersFound++; break; default: diff --git a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java similarity index 89% rename from solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java rename to solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java index a0cc1f50a81c..57f25d05b4b7 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestAppendReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java @@ -69,7 +69,7 @@ import com.carrotsearch.randomizedtesting.annotations.Repeat; @Slow -public class TestAppendReplica extends SolrCloudTestCase { +public class TestTlogReplica extends SolrCloudTestCase { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -122,8 +122,8 @@ public void tearDown() throws Exception { } /** - * Asserts that Update logs exist for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#REALTIME}, but not - * for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} + * Asserts that Update logs exist for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#NRT}, but not + * for replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PULL} */ private void assertUlogPresence(DocCollection collection) { for (Slice s:collection.getSlices()) { @@ -153,14 +153,14 @@ public void testCreateDelete() throws Exception { 2, docCollection.getSlices().size()); assertEquals("Expecting 4 relpicas per shard", 8, docCollection.getReplicas().size()); - assertEquals("Expecting 8 append replicas, 4 per shard", - 8, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).size()); - assertEquals("Expecting no realtime replicas", - 0, docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).size()); - assertEquals("Expecting no passive replicas", - 0, docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).size()); + assertEquals("Expecting 8 tlog replicas, 4 per shard", + 8, docCollection.getReplicas(EnumSet.of(Replica.Type.TLOG)).size()); + assertEquals("Expecting no nrt replicas", + 0, docCollection.getReplicas(EnumSet.of(Replica.Type.NRT)).size()); + assertEquals("Expecting no pull replicas", + 0, docCollection.getReplicas(EnumSet.of(Replica.Type.PULL)).size()); for (Slice s:docCollection.getSlices()) { - assertTrue(s.getLeader().getType() == Replica.Type.APPEND); + assertTrue(s.getLeader().getType() == Replica.Type.TLOG); List shardElectionNodes = cluster.getZkClient().getChildren(ZkStateReader.getShardLeadersElectPath(collectionName, s.getName()), null, true); assertEquals("Unexpected election nodes for Shard: " + s.getName() + ": " + Arrays.toString(shardElectionNodes.toArray()), 4, shardElectionNodes.size()); @@ -173,8 +173,8 @@ public void testCreateDelete() throws Exception { @SuppressWarnings("unchecked") public void testAddDocs() throws Exception { - int numAppendReplicas = 1 + random().nextInt(3); - DocCollection docCollection = createAndWaitForCollection(1, 0, numAppendReplicas, 0); + int numTlogReplicas = 1 + random().nextInt(3); + DocCollection docCollection = createAndWaitForCollection(1, 0, numTlogReplicas, 0); assertEquals(1, docCollection.getSlices().size()); cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar")); @@ -186,18 +186,18 @@ public void testAddDocs() throws Exception { } TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS); - for (Replica r:s.getReplicas(EnumSet.of(Replica.Type.APPEND))) { + for (Replica r:s.getReplicas(EnumSet.of(Replica.Type.TLOG))) { //TODO: assert replication < REPLICATION_TIMEOUT_SECS - try (HttpSolrClient appendReplicaClient = getHttpSolrClient(r.getCoreUrl())) { + try (HttpSolrClient tlogReplicaClient = getHttpSolrClient(r.getCoreUrl())) { while (true) { try { assertEquals("Replica " + r.getName() + " not up to date after 10 seconds", - 1, appendReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound()); + 1, tlogReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound()); // Append replicas process all updates SolrQuery req = new SolrQuery( "qt", "/admin/plugins", "stats", "true"); - QueryResponse statsResponse = appendReplicaClient.query(req); + QueryResponse statsResponse = tlogReplicaClient.query(req); assertEquals("Append replicas should recive all updates. Replica: " + r + ", response: " + statsResponse, 1L, ((Map)((NamedList)statsResponse.getResponse()).findRecursive("plugins", "UPDATE", "updateHandler", "stats")).get("UPDATE.updateHandler.cumulativeAdds.count")); break; @@ -214,22 +214,22 @@ public void testAddDocs() throws Exception { assertUlogPresence(docCollection); } - public void testAddRemoveAppendReplica() throws Exception { + public void testAddRemoveTlogReplica() throws Exception { DocCollection docCollection = createAndWaitForCollection(2, 0, 1, 0); assertEquals(2, docCollection.getSlices().size()); - CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.APPEND).process(cluster.getSolrClient()); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.TLOG).process(cluster.getSolrClient()); docCollection = assertNumberOfReplicas(0, 3, 0, true, false); - CollectionAdminRequest.addReplicaToShard(collectionName, "shard2", Replica.Type.APPEND).process(cluster.getSolrClient()); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard2", Replica.Type.TLOG).process(cluster.getSolrClient()); docCollection = assertNumberOfReplicas(0, 4, 0, true, false); waitForState("Expecting collection to have 2 shards and 2 replica each", collectionName, clusterShape(2, 2)); - //Delete passive replica from shard1 + //Delete pull replica from shard1 CollectionAdminRequest.deleteReplica( collectionName, "shard1", - docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.APPEND)).get(0).getName()) + docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.TLOG)).get(0).getName()) .process(cluster.getSolrClient()); assertNumberOfReplicas(0, 3, 0, true, true); } @@ -245,12 +245,12 @@ public void testKillLeader() throws Exception { public void testRealTimeGet() throws SolrServerException, IOException, KeeperException, InterruptedException { // should be redirected to Replica.Type.REALTIME int numReplicas = random().nextBoolean()?1:2; - int numRealtimeReplicas = random().nextBoolean()?0:2; - CollectionAdminRequest.createCollection(collectionName, "conf", 1, numRealtimeReplicas, numReplicas, 0) + int numNrtReplicas = random().nextBoolean()?0:2; + CollectionAdminRequest.createCollection(collectionName, "conf", 1, numNrtReplicas, numReplicas, 0) .setMaxShardsPerNode(100) .process(cluster.getSolrClient()); - waitForState("Unexpected replica count", collectionName, activeReplicaCount(numRealtimeReplicas, numReplicas, 0)); - DocCollection docCollection = assertNumberOfReplicas(numRealtimeReplicas, numReplicas, 0, false, true); + waitForState("Unexpected replica count", collectionName, activeReplicaCount(numNrtReplicas, numReplicas, 0)); + DocCollection docCollection = assertNumberOfReplicas(numNrtReplicas, numReplicas, 0, false, true); HttpClient httpClient = cluster.getSolrClient().getHttpClient(); int id = 0; Slice slice = docCollection.getSlice("shard1"); @@ -300,7 +300,7 @@ private void doReplaceLeader(boolean removeReplica) throws Exception { assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound()); } - waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)), REPLICATION_TIMEOUT_SECS); + waitForNumDocsInAllReplicas(1, docCollection.getReplicas(EnumSet.of(Replica.Type.TLOG)), REPLICATION_TIMEOUT_SECS); // Delete leader replica from shard1 JettySolrRunner leaderJetty = null; @@ -337,19 +337,19 @@ private void doReplaceLeader(boolean removeReplica) throws Exception { cluster.getSolrClient().commit(collectionName); // Queries should still work - waitForNumDocsInAllReplicas(2, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)), REPLICATION_TIMEOUT_SECS); + waitForNumDocsInAllReplicas(2, docCollection.getReplicas(EnumSet.of(Replica.Type.TLOG)), REPLICATION_TIMEOUT_SECS); // Start back the node if (removeReplica) { - CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.APPEND).process(cluster.getSolrClient()); + CollectionAdminRequest.addReplicaToShard(collectionName, "shard1", Replica.Type.TLOG).process(cluster.getSolrClient()); } else { ChaosMonkey.start(leaderJetty); } waitForState("Expected collection to be 1x2", collectionName, clusterShape(1, 2)); // added replica should replicate from the leader - waitForNumDocsInAllReplicas(2, docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)), REPLICATION_TIMEOUT_SECS); + waitForNumDocsInAllReplicas(2, docCollection.getReplicas(EnumSet.of(Replica.Type.TLOG)), REPLICATION_TIMEOUT_SECS); } - public void testKillAppendReplica() throws Exception { + public void testKillTlogReplica() throws Exception { DocCollection docCollection = createAndWaitForCollection(1, 0, 2, 0); waitForNumDocsInAllActiveReplicas(0); @@ -357,8 +357,8 @@ public void testKillAppendReplica() throws Exception { cluster.getSolrClient().commit(collectionName); waitForNumDocsInAllActiveReplicas(1); - JettySolrRunner passiveReplicaJetty = cluster.getReplicaJetty(docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.APPEND)).get(0)); - ChaosMonkey.kill(passiveReplicaJetty); + JettySolrRunner pullReplicaJetty = cluster.getReplicaJetty(docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.TLOG)).get(0)); + ChaosMonkey.kill(pullReplicaJetty); waitForState("Replica not removed", collectionName, activeReplicaCount(0, 1, 0)); // Also wait for the replica to be placed in state="down" waitForState("Didn't update state", collectionName, clusterStateReflectsActiveAndDownReplicas()); @@ -367,7 +367,7 @@ public void testKillAppendReplica() throws Exception { cluster.getSolrClient().commit(collectionName); waitForNumDocsInAllActiveReplicas(2); - ChaosMonkey.start(passiveReplicaJetty); + ChaosMonkey.start(pullReplicaJetty); waitForState("Replica not added", collectionName, activeReplicaCount(0, 2, 0)); waitForNumDocsInAllActiveReplicas(2); } @@ -623,15 +623,15 @@ private String getBaseUrl() { return slice.getLeader().getCoreUrl(); } - private DocCollection createAndWaitForCollection(int numShards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) throws SolrServerException, IOException, KeeperException, InterruptedException { - CollectionAdminRequest.createCollection(collectionName, "conf", numShards, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas) + private DocCollection createAndWaitForCollection(int numShards, int numNrtReplicas, int numTlogReplicas, int numPullReplicas) throws SolrServerException, IOException, KeeperException, InterruptedException { + CollectionAdminRequest.createCollection(collectionName, "conf", numShards, numNrtReplicas, numTlogReplicas, numPullReplicas) .setMaxShardsPerNode(100) .process(cluster.getSolrClient()); - int numReplicasPerShard = numRealtimeReplicas + numAppendReplicas + numPassiveReplicas; + int numReplicasPerShard = numNrtReplicas + numTlogReplicas + numPullReplicas; cluster.getSolrClient().getZkStateReader().registerCore(collectionName); //TODO: Is this needed? waitForState("Expected collection to be created with " + numShards + " shards and " + numReplicasPerShard + " replicas", collectionName, clusterShape(numShards, numReplicasPerShard)); - return assertNumberOfReplicas(numRealtimeReplicas*numShards, numAppendReplicas*numShards, numPassiveReplicas*numShards, false, true); + return assertNumberOfReplicas(numNrtReplicas*numShards, numTlogReplicas*numShards, numPullReplicas*numShards, false, true); } private void waitForNumDocsInAllActiveReplicas(int numDocs) throws IOException, SolrServerException, InterruptedException { @@ -687,18 +687,18 @@ private void waitForDeletion(String collection) throws InterruptedException, Kee } } - private DocCollection assertNumberOfReplicas(int numWriter, int numActive, int numPassive, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { + private DocCollection assertNumberOfReplicas(int numNrtReplicas, int numTlogReplicas, int numPullReplicas, boolean updateCollection, boolean activeOnly) throws KeeperException, InterruptedException { if (updateCollection) { cluster.getSolrClient().getZkStateReader().forceUpdateCollection(collectionName); } DocCollection docCollection = getCollectionState(collectionName); assertNotNull(docCollection); - assertEquals("Unexpected number of writer replicas: " + docCollection, numWriter, - docCollection.getReplicas(EnumSet.of(Replica.Type.REALTIME)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - assertEquals("Unexpected number of passive replicas: " + docCollection, numPassive, - docCollection.getReplicas(EnumSet.of(Replica.Type.PASSIVE)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); - assertEquals("Unexpected number of active replicas: " + docCollection, numActive, - docCollection.getReplicas(EnumSet.of(Replica.Type.APPEND)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of nrt replicas: " + docCollection, numNrtReplicas, + docCollection.getReplicas(EnumSet.of(Replica.Type.NRT)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of pull replicas: " + docCollection, numPullReplicas, + docCollection.getReplicas(EnumSet.of(Replica.Type.PULL)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); + assertEquals("Unexpected number of tlog replicas: " + docCollection, numTlogReplicas, + docCollection.getReplicas(EnumSet.of(Replica.Type.TLOG)).stream().filter(r->!activeOnly || r.getState() == Replica.State.ACTIVE).count()); return docCollection; } @@ -723,30 +723,30 @@ private CollectionStatePredicate clusterStateReflectsActiveAndDownReplicas() { } - private CollectionStatePredicate activeReplicaCount(int numWriter, int numActive, int numPassive) { + private CollectionStatePredicate activeReplicaCount(int numNrtReplicas, int numTlogReplicas, int numPullReplicas) { return (liveNodes, collectionState) -> { - int writersFound = 0, activesFound = 0, passivesFound = 0; + int nrtFound = 0, tlogFound = 0, pullFound = 0; if (collectionState == null) return false; for (Slice slice : collectionState) { for (Replica replica : slice) { if (replica.isActive(liveNodes)) switch (replica.getType()) { - case APPEND: - activesFound++; + case TLOG: + tlogFound++; break; - case PASSIVE: - passivesFound++; + case PULL: + pullFound++; break; - case REALTIME: - writersFound++; + case NRT: + nrtFound++; break; default: throw new AssertionError("Unexpected replica type"); } } } - return numWriter == writersFound && numActive == activesFound && numPassive == passivesFound; + return numNrtReplicas == nrtFound && numTlogReplicas == tlogFound && numPullReplicas == pullFound; }; } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java index f66f89257378..73a28519a526 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestTolerantUpdateProcessorCloud.java @@ -86,10 +86,10 @@ public class TestTolerantUpdateProcessorCloud extends SolrCloudTestCase { /** A client for talking directly to the leader of shard2 */ private static HttpSolrClient S_TWO_LEADER_CLIENT; - /** A client for talking directly to a passive replica of shard1 */ + /** A client for talking directly to a pull replica of shard1 */ private static HttpSolrClient S_ONE_NON_LEADER_CLIENT; - /** A client for talking directly to a passive replica of shard2 */ + /** A client for talking directly to a pull replica of shard2 */ private static HttpSolrClient S_TWO_NON_LEADER_CLIENT; /** A client for talking directly to a node that has no piece of the collection */ diff --git a/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java index ba92a02323bb..1534162f9314 100644 --- a/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsBasicDistributedZkTest.java @@ -44,7 +44,7 @@ public static void setupClass() throws Exception { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return false; } diff --git a/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java b/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java index ebb758d5ede4..44b4a4ed010b 100644 --- a/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java +++ b/solr/core/src/test/org/apache/solr/update/TestInPlaceUpdatesDistrib.java @@ -111,7 +111,7 @@ public static void beforeSuperClass() throws Exception { } @Override - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return onlyLeaderIndexes; } @@ -270,7 +270,7 @@ private void reorderedDBQsSimpleTest() throws Exception { private void reorderedDBQIndividualReplicaTest() throws Exception { if (onlyLeaderIndexes) { - log.info("RTG with DBQs are not working in append replicas"); + log.info("RTG with DBQs are not working in tlog replicas"); return; } clearIndex(); @@ -743,7 +743,7 @@ private void reorderedDeletesTest() throws Exception { */ private void reorderedDBQsResurrectionTest() throws Exception { if (onlyLeaderIndexes) { - log.info("RTG with DBQs are not working in append replicas"); + log.info("RTG with DBQs are not working in tlog replicas"); return; } clearIndex(); @@ -1145,7 +1145,7 @@ protected List buildRandomIndex(Float initFloat, List specialIds) */ private void reorderedDBQsUsingUpdatedValueFromADroppedUpdate() throws Exception { if (onlyLeaderIndexes) { - log.info("RTG with DBQs are not working in append replicas"); + log.info("RTG with DBQs are not working in tlog replicas"); return; } clearIndex(); diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index 0f062f505a1b..e9b370077df2 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -897,7 +897,7 @@ private Map> buildUrlMap(DocCollection col) { String url = zkProps.getCoreUrl(); urls.add(url); if (!directUpdatesToLeadersOnly) { - for (Replica replica : slice.getReplicas(EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME))) { + for (Replica replica : slice.getReplicas(EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT))) { if (!replica.getNodeName().equals(leader.getNodeName()) && !replica.getName().equals(leader.getName())) { ZkCoreNodeProps zkProps1 = new ZkCoreNodeProps(replica); diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java index 10b5c6969942..d5a2af6962c9 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java @@ -322,12 +322,12 @@ public SolrParams getParams() { * @param collection the collection name * @param config the collection config * @param numShards the number of shards in the collection - * @param numRealtimeReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#REALTIME} replicas - * @param numAppendReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#APPEND} replicas - * @param numPassiveReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} replicas + * @param numNrtReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#NRT} replicas + * @param numTlogReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#TLOG} replicas + * @param numPullReplicas the number of {@link org.apache.solr.common.cloud.Replica.Type#PULL} replicas */ - public static Create createCollection(String collection, String config, int numShards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) { - return new Create(collection, config, numShards, numRealtimeReplicas, numAppendReplicas, numPassiveReplicas); + public static Create createCollection(String collection, String config, int numShards, int numNrtReplicas, int numTlogReplicas, int numPullReplicas) { + return new Create(collection, config, numShards, numNrtReplicas, numTlogReplicas, numPullReplicas); } /** @@ -371,14 +371,14 @@ public static Create createCollectionWithImplicitRouter(String collection, Strin * @param collection the collection name * @param config the collection config * @param shards a shard definition string - * @param numRealtimeReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#REALTIME} - * @param numAppendReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#APPEND} - * @param numPassiveReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PASSIVE} + * @param numNrtReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#NRT} + * @param numTlogReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#TLOG} + * @param numPullReplicas the number of replicas of type {@link org.apache.solr.common.cloud.Replica.Type#PULL} */ - public static Create createCollectionWithImplicitRouter(String collection, String config, String shards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) { - Create createRequest = new Create(collection, config, shards, numRealtimeReplicas); - createRequest.appendReplicas = numAppendReplicas; - createRequest.passiveReplicas = numPassiveReplicas; + public static Create createCollectionWithImplicitRouter(String collection, String config, String shards, int numNrtReplicas, int numTlogReplicas, int numPullReplicas) { + Create createRequest = new Create(collection, config, shards, numNrtReplicas); + createRequest.tlogReplicas = numTlogReplicas; + createRequest.pullReplicas = numPullReplicas; return createRequest; } @@ -392,9 +392,9 @@ public static class Create extends AsyncCollectionSpecificAdminRequest { protected String routerField; protected Integer numShards; protected Integer maxShardsPerNode; - protected Integer realtimeReplicas; - protected Integer passiveReplicas; - protected Integer appendReplicas; + protected Integer nrtReplicas; + protected Integer pullReplicas; + protected Integer tlogReplicas; private Properties properties; protected Boolean autoAddReplicas; @@ -409,19 +409,19 @@ public Create() { super(CollectionAction.CREATE, null); } - private Create(String collection, String config, int numShards, int numRealtimeReplicas, int numAppendReplicas, int numPassiveReplicas) { // TODO: maybe add other constructors + private Create(String collection, String config, int numShards, int numNrtReplicas, int numTlogReplicas, int numPullReplicas) { // TODO: maybe add other constructors super(CollectionAction.CREATE, SolrIdentifierValidator.validateCollectionName(collection)); this.configName = config; this.numShards = numShards; - this.realtimeReplicas = numRealtimeReplicas; - this.passiveReplicas = numPassiveReplicas; - this.appendReplicas = numAppendReplicas; + this.nrtReplicas = numNrtReplicas; + this.pullReplicas = numPullReplicas; + this.tlogReplicas = numTlogReplicas; } - private Create(String collection, String config, String shards, int numRealtimeReplicas) { + private Create(String collection, String config, String shards, int numNrtReplicas) { super(CollectionAction.CREATE, SolrIdentifierValidator.validateCollectionName(collection)); this.configName = config; - this.realtimeReplicas = numRealtimeReplicas; + this.nrtReplicas = numNrtReplicas; this.shards = shards; this.routerName = ImplicitDocRouter.NAME; } @@ -435,10 +435,10 @@ private Create(String collection, String config, String shards, int numRealtimeR public Create setNumShards(Integer numShards) {this.numShards = numShards; return this; } public Create setMaxShardsPerNode(Integer numShards) { this.maxShardsPerNode = numShards; return this; } public Create setAutoAddReplicas(boolean autoAddReplicas) { this.autoAddReplicas = autoAddReplicas; return this; } - public Create setRealtimeReplicas(Integer realtimeReplicas) { this.realtimeReplicas = realtimeReplicas; return this;} - public Create setAppendReplicas(Integer appendReplicas) { this.appendReplicas = appendReplicas; return this;} + public Create setNrtReplicas(Integer nrtReplicas) { this.nrtReplicas = nrtReplicas; return this;} + public Create setTlogReplicas(Integer tlogReplicas) { this.tlogReplicas = tlogReplicas; return this;} - public Create setReplicationFactor(Integer repl) { this.realtimeReplicas = repl; return this; } + public Create setReplicationFactor(Integer repl) { this.nrtReplicas = repl; return this; } public Create setStateFormat(Integer stateFormat) { this.stateFormat = stateFormat; return this; } public Create setRule(String... s){ this.rule = s; return this; } public Create setSnitch(String... s){ this.snitch = s; return this; } @@ -450,11 +450,11 @@ private Create(String collection, String config, String shards, int numRealtimeR public Integer getNumShards() { return numShards; } public Integer getMaxShardsPerNode() { return maxShardsPerNode; } - public Integer getReplicationFactor() { return getNumRealtimeReplicas(); } - public Integer getNumRealtimeReplicas() { return realtimeReplicas; } + public Integer getReplicationFactor() { return getNumNrtReplicas(); } + public Integer getNumNrtReplicas() { return nrtReplicas; } public Boolean getAutoAddReplicas() { return autoAddReplicas; } - public Integer getRealtimeReplicas() { return realtimeReplicas; } - public Integer getAppendReplicas() {return appendReplicas;} + public Integer getNrtReplicas() { return nrtReplicas; } + public Integer getTlogReplicas() {return tlogReplicas;} public Integer getStateFormat() { return stateFormat; } @@ -536,9 +536,9 @@ public SolrParams getParams() { if (routerField != null) { params.set("router.field", routerField); } - if (realtimeReplicas != null) { - params.set( "replicationFactor", realtimeReplicas);// Keep both for compatibility? - params.set( ZkStateReader.REALTIME_REPLICAS, realtimeReplicas); + if (nrtReplicas != null) { + params.set( "replicationFactor", nrtReplicas);// Keep both for compatibility? + params.set( ZkStateReader.NRT_REPLICAS, nrtReplicas); } if (autoAddReplicas != null) { params.set(ZkStateReader.AUTO_ADD_REPLICAS, autoAddReplicas); @@ -549,11 +549,11 @@ public SolrParams getParams() { if (stateFormat != null) { params.set(DocCollection.STATE_FORMAT, stateFormat); } - if (passiveReplicas != null) { - params.set(ZkStateReader.PASSIVE_REPLICAS, passiveReplicas); + if (pullReplicas != null) { + params.set(ZkStateReader.PULL_REPLICAS, pullReplicas); } - if (appendReplicas != null) { - params.set(ZkStateReader.APPEND_REPLICAS, appendReplicas); + if (tlogReplicas != null) { + params.set(ZkStateReader.TLOG_REPLICAS, tlogReplicas); } if(rule != null) params.set("rule", rule); if(snitch != null) params.set("snitch", snitch); @@ -1656,7 +1656,7 @@ public SolrParams getParams() { * Returns a SolrRequest to add a replica to a shard in a collection */ public static AddReplica addReplicaToShard(String collection, String shard) { - return addReplicaToShard(collection, shard, Replica.Type.REALTIME); + return addReplicaToShard(collection, shard, Replica.Type.NRT); } /** @@ -1670,7 +1670,7 @@ public static AddReplica addReplicaToShard(String collection, String shard, Repl * Returns a SolrRequest to add a replica to a collection using a route key */ public static AddReplica addReplicaByRouteKey(String collection, String routeKey) { - return new AddReplica(collection, null, routeKey, Replica.Type.REALTIME); + return new AddReplica(collection, null, routeKey, Replica.Type.NRT); } // ADDREPLICA request diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java b/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java index 3b6ac803776c..47bdce37a2b2 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/DocCollection.java @@ -35,9 +35,9 @@ import static org.apache.solr.common.cloud.ZkStateReader.AUTO_ADD_REPLICAS; import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE; import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR; -import static org.apache.solr.common.cloud.ZkStateReader.REALTIME_REPLICAS; -import static org.apache.solr.common.cloud.ZkStateReader.APPEND_REPLICAS; -import static org.apache.solr.common.cloud.ZkStateReader.PASSIVE_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS; +import static org.apache.solr.common.cloud.ZkStateReader.PULL_REPLICAS; /** * Models a Collection in zookeeper (but that Java name is obviously taken, hence "DocCollection") @@ -61,9 +61,9 @@ public class DocCollection extends ZkNodeProps implements Iterable { private final String znode; private final Integer replicationFactor; - private final Integer numRealtimeReplicas; - private final Integer numAppendReplicas; - private final Integer numPassiveReplicas; + private final Integer numNrtReplicas; + private final Integer numTlogReplicas; + private final Integer numPullReplicas; private final Integer maxShardsPerNode; private final Boolean autoAddReplicas; @@ -87,9 +87,9 @@ public DocCollection(String name, Map slices, Map this.nodeNameLeaderReplicas = new HashMap<>(); this.nodeNameReplicas = new HashMap<>(); this.replicationFactor = (Integer) verifyProp(props, REPLICATION_FACTOR); - this.numRealtimeReplicas = (Integer) verifyProp(props, REALTIME_REPLICAS); - this.numAppendReplicas = (Integer) verifyProp(props, APPEND_REPLICAS); - this.numPassiveReplicas = (Integer) verifyProp(props, PASSIVE_REPLICAS); + this.numNrtReplicas = (Integer) verifyProp(props, NRT_REPLICAS); + this.numTlogReplicas = (Integer) verifyProp(props, TLOG_REPLICAS); + this.numPullReplicas = (Integer) verifyProp(props, PULL_REPLICAS); this.maxShardsPerNode = (Integer) verifyProp(props, MAX_SHARDS_PER_NODE); Boolean autoAddReplicas = (Boolean) verifyProp(props, AUTO_ADD_REPLICAS); this.autoAddReplicas = autoAddReplicas == null ? Boolean.FALSE : autoAddReplicas; @@ -136,9 +136,9 @@ public static Object verifyProp(Map props, String propName) { switch (propName) { case MAX_SHARDS_PER_NODE: case REPLICATION_FACTOR: - case REALTIME_REPLICAS: - case PASSIVE_REPLICAS: - case APPEND_REPLICAS: + case NRT_REPLICAS: + case PULL_REPLICAS: + case TLOG_REPLICAS: return Integer.parseInt(o.toString()); case AUTO_ADD_REPLICAS: return Boolean.parseBoolean(o.toString()); @@ -342,16 +342,16 @@ public boolean equals(Object that) { return super.equals(that) && Objects.equals(this.znode, other.znode) && this.znodeVersion == other.znodeVersion; } - public Integer getNumRealtimeReplicas() { - return numRealtimeReplicas; + public Integer getNumNrtReplicas() { + return numNrtReplicas; } - public Integer getNumAppendReplicas() { - return numAppendReplicas; + public Integer getNumTlogReplicas() { + return numTlogReplicas; } - public Integer getNumPassiveReplicas() { - return numPassiveReplicas; + public Integer getNumPullReplicas() { + return numPullReplicas; } } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java index 8f3ed15b5435..b7655be31354 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Replica.java @@ -86,23 +86,23 @@ public static State getState(String stateStr) { public enum Type { /** - * Writes updates to transaction log and indexes locally. Replicas of type {@link #REALTIME} support NRT (soft commits) and RTG. - * Any {@link #REALTIME} replica can become a leader. A shard leader will forward updates to all active {@link #REALTIME} and - * {@link #APPEND} replicas. + * Writes updates to transaction log and indexes locally. Replicas of type {@link Type#NRT} support NRT (soft commits) and RTG. + * Any {@link Type#NRT} replica can become a leader. A shard leader will forward updates to all active {@link Type#NRT} and + * {@link Type#TLOG} replicas. */ - REALTIME, + NRT, /** - * Writes to transaction log, but not to index, uses replication. Any {@link #APPEND} replica can become leader (by first - * applying all local transaction log elements). If a replica is of type {@link #APPEND} but is also the leader, it will behave - * as a {@link #REALTIME}. A shard leader will forward updates to all active {@link #REALTIME} and {@link #APPEND} replicas. + * Writes to transaction log, but not to index, uses replication. Any {@link Type#TLOG} replica can become leader (by first + * applying all local transaction log elements). If a replica is of type {@link Type#TLOG} but is also the leader, it will behave + * as a {@link Type#NRT}. A shard leader will forward updates to all active {@link Type#NRT} and {@link Type#TLOG} replicas. */ - APPEND, + TLOG, /** - * Doesn’t index or writes to transaction log. Just replicates from {@link #REALTIME} or {@link #APPEND} replicas. {@link #PASSIVE} - * replicas can’t become shard leaders (i.e., if there are only passive replicas in the collection at some point, updates will fail + * Doesn’t index or writes to transaction log. Just replicates from {@link Type#NRT} or {@link Type#TLOG} replicas. {@link Type#PULL} + * replicas can’t become shard leaders (i.e., if there are only pull replicas in the collection at some point, updates will fail * same as if there is no leaders, queries continue to work), so they don’t even participate in elections. */ - PASSIVE + PULL } private final String name; @@ -122,7 +122,7 @@ public Replica(String name, Map propMap) { } String typeString = (String)propMap.get(ZkStateReader.REPLICA_TYPE); if (typeString == null) { - this.type = Type.REALTIME; + this.type = Type.NRT; } else { this.type = Type.valueOf(typeString); } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java index 3f84e1e89455..844714d19b73 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/Slice.java @@ -205,7 +205,7 @@ private Map makeReplicas(Map genericReplicas) { private Replica findLeader() { for (Replica replica : replicas.values()) { if (replica.getStr(LEADER) != null) { - assert replica.getType() == Type.APPEND || replica.getType() == Type.REALTIME: "Passive replica should not become leader!"; + assert replica.getType() == Type.TLOG || replica.getType() == Type.NRT: "Pull replica should not become leader!"; return replica; } } diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java index 3f6f1accbb09..3c3497af855c 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java @@ -99,9 +99,9 @@ public class ZkStateReader implements Closeable { public static final String AUTO_ADD_REPLICAS = "autoAddReplicas"; public static final String MAX_CORES_PER_NODE = "maxCoresPerNode"; //TODO: Move these constants out of ZkStateReader - public static final String PASSIVE_REPLICAS = "passiveReplicas"; - public static final String REALTIME_REPLICAS = "realtimeReplicas"; - public static final String APPEND_REPLICAS = "appendReplicas"; + public static final String PULL_REPLICAS = "pullReplicas"; + public static final String NRT_REPLICAS = "nrtReplicas"; + public static final String TLOG_REPLICAS = "tlogReplicas"; public static final String ROLES = "/roles.json"; @@ -788,7 +788,7 @@ public List getReplicaProps(String collection, String shardId, public List getReplicaProps(String collection, String shardId, String thisCoreNodeName, Replica.State mustMatchStateFilter, Replica.State mustNotMatchStateFilter) { //nocommit: We don't need all these getReplicaProps method overloading. Also, it's odd that the default is to return replicas of type APPEND and REALTIME only - return getReplicaProps(collection, shardId, thisCoreNodeName, mustMatchStateFilter, null, EnumSet.of(Replica.Type.APPEND, Replica.Type.REALTIME)); + return getReplicaProps(collection, shardId, thisCoreNodeName, mustMatchStateFilter, null, EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT)); } public List getReplicaProps(String collection, String shardId, String thisCoreNodeName, diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java index 7a89decfc7e6..0f9b1475bb0e 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/CollectionAdminRequestRequiredParamsTest.java @@ -81,7 +81,7 @@ public void testAddReplica() { request = new CollectionAdminRequest.AddReplica() .setShardName("shard") .setCollectionName("collection") - .setType(Replica.Type.REALTIME); + .setType(Replica.Type.NRT); assertContainsParams(request.getParams(), ACTION, COLLECTION, SHARD, ZkStateReader.REPLICA_TYPE); } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java index 2d082ea48fe4..f30c035c7e40 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java @@ -236,10 +236,10 @@ public void distribSetUp() throws Exception { CreateMode.PERSISTENT, true); } } - if (useAppendReplicas()) { - log.info("Will use {} replicas unless explicitly asked otherwise", Replica.Type.APPEND); + if (useTlogReplicas()) { + log.info("Will use {} replicas unless explicitly asked otherwise", Replica.Type.TLOG); } else { - log.info("Will use {} replicas unless explicitly asked otherwise", Replica.Type.REALTIME); + log.info("Will use {} replicas unless explicitly asked otherwise", Replica.Type.NRT); } } @@ -281,7 +281,7 @@ protected void initCloud() throws Exception { shardToJetty, shardToLeaderJetty); } - protected boolean useAppendReplicas() { + protected boolean useTlogReplicas() { return false; } @@ -397,13 +397,13 @@ protected List createJettys(int numJettys) throws Exception { "name", DEFAULT_COLLECTION, "numShards", String.valueOf(sliceCount), DocCollection.STATE_FORMAT, getStateFormat(), - ZkStateReader.REALTIME_REPLICAS, useAppendReplicas()?"0":"1", - ZkStateReader.APPEND_REPLICAS, useAppendReplicas()?"1":"0", - ZkStateReader.PASSIVE_REPLICAS, String.valueOf(getPassiveReplicaCount())))); + ZkStateReader.NRT_REPLICAS, useTlogReplicas()?"0":"1", + ZkStateReader.TLOG_REPLICAS, useTlogReplicas()?"1":"0", + ZkStateReader.PULL_REPLICAS, String.valueOf(getPullReplicaCount())))); zkClient.close(); } - int numPassiveReplicas = getPassiveReplicaCount() * sliceCount; + int numPullReplicas = getPullReplicaCount() * sliceCount; for (int i = 1; i <= numJettys; i++) { if (sb.length() > 0) sb.append(','); @@ -415,17 +415,17 @@ ZkStateReader.APPEND_REPLICAS, useAppendReplicas()?"1":"0", setupJettySolrHome(jettyDir); JettySolrRunner j; - if (numPassiveReplicas > 0) { - numPassiveReplicas--; - log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.PASSIVE); + if (numPullReplicas > 0) { + numPullReplicas--; + log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.PULL); j = createJetty(jettyDir, useJettyDataDir ? getDataDir(testDir + "/jetty" - + cnt) : null, null, "solrconfig.xml", null, Replica.Type.PASSIVE); - } else if (useAppendReplicas()) { - log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.APPEND); + + cnt) : null, null, "solrconfig.xml", null, Replica.Type.PULL); + } else if (useTlogReplicas()) { + log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.TLOG); j = createJetty(jettyDir, useJettyDataDir ? getDataDir(testDir + "/jetty" - + cnt) : null, null, "solrconfig.xml", null, Replica.Type.APPEND); + + cnt) : null, null, "solrconfig.xml", null, Replica.Type.TLOG); } else { - log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.REALTIME); + log.info("create jetty {} in directory {} of type {}", i, jettyDir, Replica.Type.NRT); j = createJetty(jettyDir, useJettyDataDir ? getDataDir(testDir + "/jetty" + cnt) : null, null, "solrconfig.xml", null, null); } @@ -477,7 +477,7 @@ ZkStateReader.APPEND_REPLICAS, useAppendReplicas()?"1":"0", } - protected int getPassiveReplicaCount() { + protected int getPullReplicaCount() { return 0; } @@ -547,7 +547,7 @@ public JettySolrRunner createJetty(File solrHome, String dataDir, String shardLi if (replicaType != null) { props.setProperty("replicaType", replicaType.toString()); } else if (random().nextBoolean()) { - props.setProperty("replicaType", Replica.Type.REALTIME.toString()); + props.setProperty("replicaType", Replica.Type.NRT.toString()); } props.setProperty("coreRootDirectory", solrHome.toPath().resolve("cores").toAbsolutePath().toString()); @@ -586,7 +586,7 @@ public JettySolrRunner createProxiedJetty(File solrHome, String dataDir, if (replicaType != null) { props.setProperty("replicaType", replicaType.toString()); } else if (random().nextBoolean()) { - props.setProperty("replicaType", Replica.Type.REALTIME.toString()); + props.setProperty("replicaType", Replica.Type.NRT.toString()); } props.setProperty("coreRootDirectory", solrHome.toPath().resolve("cores").toAbsolutePath().toString()); @@ -1616,23 +1616,23 @@ protected CollectionAdminResponse createCollection(Map> co String shardNames = (String) collectionProps.get(SHARDS_PROP); numShards = StrUtils.splitSmart(shardNames,',').size(); } - Integer numRealtimeReplicas = (Integer) collectionProps.get(ZkStateReader.REALTIME_REPLICAS); - if (numRealtimeReplicas == null) { - numRealtimeReplicas = (Integer) collectionProps.get(ZkStateReader.REPLICATION_FACTOR); + Integer numNrtReplicas = (Integer) collectionProps.get(ZkStateReader.NRT_REPLICAS); + if (numNrtReplicas == null) { + numNrtReplicas = (Integer) collectionProps.get(ZkStateReader.REPLICATION_FACTOR); } - if(numRealtimeReplicas == null){ - numRealtimeReplicas = (Integer) OverseerCollectionMessageHandler.COLL_PROPS.get(ZkStateReader.REPLICATION_FACTOR); + if(numNrtReplicas == null){ + numNrtReplicas = (Integer) OverseerCollectionMessageHandler.COLL_PROPS.get(ZkStateReader.REPLICATION_FACTOR); } - if (numRealtimeReplicas == null) { - numRealtimeReplicas = Integer.valueOf(0); + if (numNrtReplicas == null) { + numNrtReplicas = Integer.valueOf(0); } - Integer numAppendReplicas = (Integer) collectionProps.get(ZkStateReader.APPEND_REPLICAS); - if (numAppendReplicas == null) { - numAppendReplicas = Integer.valueOf(0); + Integer numTlogReplicas = (Integer) collectionProps.get(ZkStateReader.TLOG_REPLICAS); + if (numTlogReplicas == null) { + numTlogReplicas = Integer.valueOf(0); } - Integer numPassiveReplicas = (Integer) collectionProps.get(ZkStateReader.PASSIVE_REPLICAS); - if (numPassiveReplicas == null) { - numPassiveReplicas = Integer.valueOf(0); + Integer numPullReplicas = (Integer) collectionProps.get(ZkStateReader.PULL_REPLICAS); + if (numPullReplicas == null) { + numPullReplicas = Integer.valueOf(0); } if (confSetName != null) { params.set("collection.configName", confSetName); @@ -1641,7 +1641,7 @@ protected CollectionAdminResponse createCollection(Map> co int clientIndex = random().nextInt(2); List list = new ArrayList<>(); list.add(numShards); - list.add(numRealtimeReplicas + numAppendReplicas + numPassiveReplicas); + list.add(numNrtReplicas + numTlogReplicas + numPullReplicas); if (collectionInfos != null) { collectionInfos.put(collectionName, list); } @@ -1669,14 +1669,14 @@ protected CollectionAdminResponse createCollection(Map> co protected CollectionAdminResponse createCollection(Map> collectionInfos, String collectionName, int numShards, int replicationFactor, int maxShardsPerNode, SolrClient client, String createNodeSetStr) throws SolrServerException, IOException { - int numRealtimeReplicas = useAppendReplicas()?0:replicationFactor; - int numAppendReplicas = useAppendReplicas()?replicationFactor:0; + int numNrtReplicas = useTlogReplicas()?0:replicationFactor; + int numTlogReplicas = useTlogReplicas()?replicationFactor:0; return createCollection(collectionInfos, collectionName, Utils.makeMap( NUM_SLICES, numShards, - ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, - ZkStateReader.APPEND_REPLICAS, numAppendReplicas, - ZkStateReader.PASSIVE_REPLICAS, getPassiveReplicaCount(), + ZkStateReader.NRT_REPLICAS, numNrtReplicas, + ZkStateReader.TLOG_REPLICAS, numTlogReplicas, + ZkStateReader.PULL_REPLICAS, getPullReplicaCount(), CREATE_NODE_SET, createNodeSetStr, ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode), client); @@ -1685,14 +1685,14 @@ ZkStateReader.PASSIVE_REPLICAS, getPassiveReplicaCount(), protected CollectionAdminResponse createCollection(Map> collectionInfos, String collectionName, int numShards, int replicationFactor, int maxShardsPerNode, SolrClient client, String createNodeSetStr, String configName) throws SolrServerException, IOException { - int numRealtimeReplicas = useAppendReplicas()?0:replicationFactor; - int numAppendReplicas = useAppendReplicas()?replicationFactor:0; + int numNrtReplicas = useTlogReplicas()?0:replicationFactor; + int numTlogReplicas = useTlogReplicas()?replicationFactor:0; return createCollection(collectionInfos, collectionName, Utils.makeMap( NUM_SLICES, numShards, - ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, - ZkStateReader.APPEND_REPLICAS, numAppendReplicas, - ZkStateReader.PASSIVE_REPLICAS, getPassiveReplicaCount(), + ZkStateReader.NRT_REPLICAS, numNrtReplicas, + ZkStateReader.TLOG_REPLICAS, numTlogReplicas, + ZkStateReader.PULL_REPLICAS, getPullReplicaCount(), CREATE_NODE_SET, createNodeSetStr, ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode), client, configName); @@ -1873,13 +1873,13 @@ protected void createCollection(String collName, int numShards ) throws Exception { int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrClient() .getZkStateReader().getClusterState().getLiveNodes().size())) + 1; - int numRealtimeReplicas = useAppendReplicas()?0:replicationFactor; - int numAppendReplicas = useAppendReplicas()?replicationFactor:0; + int numNrtReplicas = useTlogReplicas()?0:replicationFactor; + int numTlogReplicas = useTlogReplicas()?replicationFactor:0; Map props = makeMap( ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode, - ZkStateReader.REALTIME_REPLICAS, numRealtimeReplicas, - ZkStateReader.APPEND_REPLICAS, numAppendReplicas, - ZkStateReader.PASSIVE_REPLICAS, getPassiveReplicaCount(), + ZkStateReader.NRT_REPLICAS, numNrtReplicas, + ZkStateReader.TLOG_REPLICAS, numTlogReplicas, + ZkStateReader.PULL_REPLICAS, getPullReplicaCount(), NUM_SLICES, numShards); Map> collectionInfos = new HashMap<>(); createCollection(collectionInfos, collName, props, client); @@ -2078,24 +2078,24 @@ protected void waitForReplicationFromReplicas(String collectionName, ZkStateRead if (timeout.hasTimedOut()) { fail("Unable to get leader indexVersion"); } - for (Replica passiveReplica:s.getReplicas(EnumSet.of(Replica.Type.PASSIVE,Replica.Type.APPEND))) { - if (!zkStateReader.getClusterState().liveNodesContain(passiveReplica.getNodeName())) { + for (Replica pullReplica:s.getReplicas(EnumSet.of(Replica.Type.PULL,Replica.Type.TLOG))) { + if (!zkStateReader.getClusterState().liveNodesContain(pullReplica.getNodeName())) { continue; } while (true) { - long replicaIndexVersion = getIndexVersion(passiveReplica); + long replicaIndexVersion = getIndexVersion(pullReplica); if (leaderIndexVersion == replicaIndexVersion) { - log.debug("Leader replica's version ({}) in sync with replica({}): {} == {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); + log.debug("Leader replica's version ({}) in sync with replica({}): {} == {}", leader.getName(), pullReplica.getName(), leaderIndexVersion, replicaIndexVersion); break; } else { if (timeout.hasTimedOut()) { logReplicaTypesReplicationInfo(collectionName, zkStateReader); - fail(String.format(Locale.ROOT, "Timed out waiting for replica %s (%d) to replicate from leader %s (%d)", passiveReplica.getName(), replicaIndexVersion, leader.getName(), leaderIndexVersion)); + fail(String.format(Locale.ROOT, "Timed out waiting for replica %s (%d) to replicate from leader %s (%d)", pullReplica.getName(), replicaIndexVersion, leader.getName(), leaderIndexVersion)); } if (leaderIndexVersion > replicaIndexVersion) { - log.debug("{} version is {} and leader's is {}, will wait for replication", passiveReplica.getName(), replicaIndexVersion, leaderIndexVersion); + log.debug("{} version is {} and leader's is {}, will wait for replication", pullReplica.getName(), replicaIndexVersion, leaderIndexVersion); } else { - log.debug("Leader replica's version ({}) is lower than passive replica({}): {} < {}", leader.getName(), passiveReplica.getName(), leaderIndexVersion, replicaIndexVersion); + log.debug("Leader replica's version ({}) is lower than pull replica({}): {} < {}", leader.getName(), pullReplica.getName(), leaderIndexVersion, replicaIndexVersion); } Thread.sleep(1000); } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java index 355e6016307f..bdbbdd2b6f80 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/ChaosMonkey.java @@ -379,7 +379,7 @@ public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLea boolean canKillIndexer = canKillIndexer(slice); if (!canKillIndexer) { - monkeyLog("Number of indexer nodes (realtime or append) is not enough to kill one of them, Will only choose a passive replica to kill"); + monkeyLog("Number of indexer nodes (nrt or tlog replicas) is not enough to kill one of them, Will only choose a pull replica to kill"); } int chance = chaosRandom.nextInt(10); @@ -395,10 +395,10 @@ public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLea attempt++; int index = chaosRandom.nextInt(jetties.size()); cjetty = jetties.get(index); - if (canKillIndexer || getTypeForJetty(slice, cjetty) == Replica.Type.PASSIVE) { + if (canKillIndexer || getTypeForJetty(slice, cjetty) == Replica.Type.PULL) { break; } else if (attempt > 20) { - monkeyLog("Can't kill indexer nodes (realtime or append) and couldn't find a random passive node after 20 attempts - monkey cannot kill :("); + monkeyLog("Can't kill indexer nodes (nrt or tlog replicas) and couldn't find a random pull node after 20 attempts - monkey cannot kill :("); return null; } } @@ -481,7 +481,7 @@ private boolean canKillIndexer(String sliceName) throws KeeperException, Interru if (cloudJetty.jetty.isRunning() && state == Replica.State.ACTIVE - && (replicaType == Replica.Type.APPEND || replicaType == Replica.Type.REALTIME) + && (replicaType == Replica.Type.TLOG || replicaType == Replica.Type.NRT) && zkStateReader.getClusterState().liveNodesContain(nodeName)) { numIndexersFoundInShard++; } From 14ea4ad8fa262963cfb969634413411c61222165 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Fri, 12 May 2017 14:49:08 -0700 Subject: [PATCH 28/41] Revert "Temporary commit for SOLR-10524" This reverts commit b16f0e3531bce9ca600bb53d4bea908c269ce0d1. --- .../src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java | 3 --- .../test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java | 2 -- 2 files changed, 5 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java index 1eddeaa7579f..c7e8c523961a 100644 --- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java +++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java @@ -166,9 +166,6 @@ public ClusterState enqueueUpdate(ClusterState prevState, ZkWriteCommand cmd, Zk * @return true if a flush is required, false otherwise */ protected boolean maybeFlushBefore(ZkWriteCommand cmd) { - if (cmd.collection == null) { - return false; - } return cmd.collection.getStateFormat() != lastStateFormat; } diff --git a/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java b/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java index 729857f17eff..85dbf4aba720 100644 --- a/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateWriterTest.java @@ -35,7 +35,6 @@ import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.util.Utils; import org.apache.zookeeper.KeeperException; -import org.junit.Ignore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,7 +42,6 @@ public class ZkStateWriterTest extends SolrTestCaseJ4 { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - @Ignore public void testZkStateWriterBatching() throws Exception { String zkDir = createTempDir("testZkStateWriterBatching").toFile().getAbsolutePath(); From e927db5ad0cb425b8b1f14f8183235d5628412f6 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Fri, 12 May 2017 17:52:50 -0700 Subject: [PATCH 29/41] PrepRecovery throws NPE when core is not found --- .../solr/handler/admin/PrepRecoveryOp.java | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java index 39892b0e6b66..748982d79f9e 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java @@ -64,18 +64,20 @@ public void execute(CallInfo it) throws Exception { Boolean onlyIfLeader = params.getBool("onlyIfLeader"); Boolean onlyIfLeaderActive = params.getBool("onlyIfLeaderActive"); - log.info("Going to wait for coreNodeName: " + coreNodeName + ", state: " + waitForState - + ", checkLive: " + checkLive + ", onlyIfLeader: " + onlyIfLeader - + ", onlyIfLeaderActive: " + onlyIfLeaderActive); - int maxTries = 0; + CoreContainer coreContainer = it.handler.coreContainer; + // wait long enough for the leader conflict to work itself out plus a little extra + int conflictWaitMs = coreContainer.getZkController().getLeaderConflictResolveWait(); + int maxTries = (int) Math.round(conflictWaitMs / 1000) + 3; + log.info("Going to wait for coreNodeName: {}, state: {}, checkLive: {}, onlyIfLeader: {}, onlyIfLeaderActive: {}, maxTime: {} s", + coreNodeName, waitForState, checkLive, onlyIfLeader, onlyIfLeaderActive, maxTries); + Replica.State state = null; boolean live = false; int retry = 0; while (true) { - CoreContainer coreContainer = it.handler.coreContainer; try (SolrCore core = coreContainer.getCore(cname)) { - if (core == null && retry == 30) { + if (core == null && retry == Math.min(30, maxTries)) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "core not found:" + cname); } @@ -103,15 +105,6 @@ public void execute(CallInfo it) throws Exception { coreContainer.getZkController().getZkStateReader().forceUpdateCollection(collectionName); } - if (maxTries == 0) { - // wait long enough for the leader conflict to work itself out plus a little extra - int conflictWaitMs = coreContainer.getZkController().getLeaderConflictResolveWait(); - maxTries = (int) Math.round(conflictWaitMs / 1000) + 3; - log.info("Will wait a max of " + maxTries + " seconds to see " + cname + " (" + - cloudDescriptor.getShardId() + " of " + - cloudDescriptor.getCollectionName() + ") have state: " + waitForState); - } - ClusterState clusterState = coreContainer.getZkController().getClusterState(); DocCollection collection = clusterState.getCollection(collectionName); Slice slice = collection.getSlice(cloudDescriptor.getShardId()); @@ -161,6 +154,7 @@ public void execute(CallInfo it) throws Exception { String collection = null; String leaderInfo = null; String shardId = null; + try { CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor(); @@ -176,8 +170,7 @@ public void execute(CallInfo it) throws Exception { "I was asked to wait on state " + waitForState + " for " + shardId + " in " + collection + " on " + nodeName + " but I still do not see the requested state. I see state: " - + Objects.toString(state) + " live:" + live + " leader from ZK: " + leaderInfo - ); + + Objects.toString(state) + " live:" + live + " leader from ZK: " + leaderInfo); } if (coreContainer.isShutDown()) { From 659a0b4125559225e8c68efdabc8b670adf0715d Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Fri, 12 May 2017 17:53:22 -0700 Subject: [PATCH 30/41] Add support for ReplaceNodeCmd --- .../org/apache/solr/cloud/ReplaceNodeCmd.java | 4 ++-- .../apache/solr/cloud/ReplaceNodeTest.java | 21 +++++++++++++++++-- .../solrj/request/CollectionAdminRequest.java | 5 ++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java b/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java index 92c9afedcc55..e4240be01882 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java @@ -154,9 +154,9 @@ static List getReplicasOfNode(String source, ClusterState state) { SHARD_ID_PROP, slice.getName(), ZkStateReader.CORE_NAME_PROP, replica.getCoreName(), ZkStateReader.REPLICA_PROP, replica.getName(), + ZkStateReader.REPLICA_TYPE, replica.getType().name(), CoreAdminParams.NODE, source); - sourceReplicas.add(props - ); + sourceReplicas.add(props); } } } diff --git a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java index 1c7575d52257..e1af6075b647 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ReplaceNodeTest.java @@ -21,6 +21,7 @@ import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collections; +import java.util.EnumSet; import java.util.Set; import org.apache.solr.client.solrj.impl.CloudSolrClient; @@ -29,6 +30,9 @@ import org.apache.solr.client.solrj.request.CoreAdminRequest; import org.apache.solr.client.solrj.response.CoreAdminResponse; import org.apache.solr.client.solrj.response.RequestStatusState; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.util.StrUtils; import org.junit.BeforeClass; import org.junit.Test; @@ -60,7 +64,12 @@ public void test() throws Exception { Collections.shuffle(l, random()); String emptyNode = l.remove(0); String node2bdecommissioned = l.get(0); - CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(coll, "conf1", 5, 2); + CollectionAdminRequest.Create create; + create = pickRandom(CollectionAdminRequest.createCollection(coll, "conf1", 5, 2), + CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,1,0), + CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,1,1), + CollectionAdminRequest.createCollection(coll, "conf1", 5, 1,0,1), + CollectionAdminRequest.createCollection(coll, "conf1", 5, 0,2,0)); create.setCreateNodeSet(StrUtils.join(l, ',')).setMaxShardsPerNode(3); cloudClient.request(create); log.info("excluded_node : {} ", emptyNode); @@ -98,7 +107,15 @@ public void test() throws Exception { assertTrue(success); try (HttpSolrClient coreclient = getHttpSolrClient(cloudClient.getZkStateReader().getBaseUrlForNodeName(emptyNode))) { CoreAdminResponse status = CoreAdminRequest.getStatus(null, coreclient); - assertTrue(status.getCoreStatus().size() == 0); + assertEquals("Expecting no cores but found some: " + status.getCoreStatus(), 0, status.getCoreStatus().size()); + } + + DocCollection collection = cloudClient.getZkStateReader().getClusterState().getCollection(coll); + assertEquals(create.getNumShards().intValue(), collection.getSlices().size()); + for (Slice s:collection.getSlices()) { + assertEquals(create.getNumNrtReplicas().intValue(), s.getReplicas(EnumSet.of(Replica.Type.NRT)).size()); + assertEquals(create.getNumTlogReplicas().intValue(), s.getReplicas(EnumSet.of(Replica.Type.TLOG)).size()); + assertEquals(create.getNumPullReplicas().intValue(), s.getReplicas(EnumSet.of(Replica.Type.PULL)).size()); } } } diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java index 0b8dd74ab46e..dcfed27650c8 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java @@ -457,9 +457,8 @@ private Create(String collection, String config, String shards, int numNrtReplic public Integer getReplicationFactor() { return getNumNrtReplicas(); } public Integer getNumNrtReplicas() { return nrtReplicas; } public Boolean getAutoAddReplicas() { return autoAddReplicas; } - public Integer getNrtReplicas() { return nrtReplicas; } - public Integer getTlogReplicas() {return tlogReplicas;} - public Integer getPullReplicas() {return pullReplicas;} + public Integer getNumTlogReplicas() {return tlogReplicas;} + public Integer getNumPullReplicas() {return pullReplicas;} public Integer getStateFormat() { return stateFormat; } From bcdb827036677d703897e7df40baab13cc665f27 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Fri, 12 May 2017 18:13:53 -0700 Subject: [PATCH 31/41] Include replica types in DeleteNodeTest --- .../test/org/apache/solr/cloud/DeleteNodeTest.java | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java index 8d2f6f25a783..9f461f0e7f00 100644 --- a/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/DeleteNodeTest.java @@ -18,7 +18,6 @@ package org.apache.solr.cloud; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collections; import java.util.Set; @@ -29,12 +28,9 @@ import org.apache.solr.common.util.StrUtils; import org.junit.BeforeClass; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class DeleteNodeTest extends SolrCloudTestCase { - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - + @BeforeClass public static void setupCluster() throws Exception { configureCluster(6) @@ -54,7 +50,10 @@ public void test() throws Exception { Set liveNodes = cloudClient.getZkStateReader().getClusterState().getLiveNodes(); ArrayList l = new ArrayList<>(liveNodes); Collections.shuffle(l, random()); - CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(coll, "conf1", 5, 2); + CollectionAdminRequest.Create create = pickRandom( + CollectionAdminRequest.createCollection(coll, "conf1", 5, 2, 0, 0), + CollectionAdminRequest.createCollection(coll, "conf1", 5, 1, 1, 0), + CollectionAdminRequest.createCollection(coll, "conf1", 5, 0, 1, 1)); create.setCreateNodeSet(StrUtils.join(l, ',')).setMaxShardsPerNode(3); cloudClient.request(create); String node2bdecommissioned = l.get(0); From 2bd43abce27fceb80a1b0809a797ccd571f3e108 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 16 May 2017 15:46:23 -0700 Subject: [PATCH 32/41] Include replica types in delete replicas by count --- .../apache/solr/cloud/DeleteReplicaTest.java | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java index 5699a8f3bf5d..4c6253e3f01a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java @@ -19,9 +19,11 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.EnumSet; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.CoreStatus; +import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; @@ -101,11 +103,29 @@ public void deleteReplicaAndVerifyDirectoryCleanup() throws Exception { public void deleteReplicaByCount() throws Exception { final String collectionName = "deleteByCount"; - CollectionAdminRequest.createCollection(collectionName, "conf", 1, 3).process(cluster.getSolrClient()); + pickRandom( + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 3), + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 1, 1), + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, 2), + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 0, 1, 2)) + .process(cluster.getSolrClient()); waitForState("Expected a single shard with three replicas", collectionName, clusterShape(1, 3)); CollectionAdminRequest.deleteReplicasFromShard(collectionName, "shard1", 2).process(cluster.getSolrClient()); waitForState("Expected a single shard with a single replica", collectionName, clusterShape(1, 1)); + + try { + CollectionAdminRequest.deleteReplicasFromShard(collectionName, "shard1", 1).process(cluster.getSolrClient()); + fail("Expected Exception, Can't delete the last replica by count"); + } catch (SolrException e) { + // expected + assertEquals(SolrException.ErrorCode.BAD_REQUEST.code, e.code()); + assertTrue(e.getMessage().contains("There is only one replica available")); + } + DocCollection docCollection = getCollectionState(collectionName); + // We know that since leaders are preserved, PULL replicas should not be left alone in the shard + assertEquals(0, docCollection.getSlice("shard1").getReplicas(EnumSet.of(Replica.Type.PULL)).size()); + } From bc974c17d9c7fe3dc6ef2e09389bb81960e7c8d6 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 16 May 2017 15:48:20 -0700 Subject: [PATCH 33/41] Temporary commit to prevent tests to hang (as proposed in SOLR-9824) --- .../impl/ConcurrentUpdateSolrClient.java | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java index fa9350321e20..bc37c130ded5 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/ConcurrentUpdateSolrClient.java @@ -449,9 +449,12 @@ private void addRunner() { try { Runner r = new Runner(); runners.add(r); - - scheduler.execute(r); // this can throw an exception if the scheduler has been shutdown, but that should be fine. - + try { + scheduler.execute(r); // this can throw an exception if the scheduler has been shutdown, but that should be fine. + } catch (RuntimeException e) { + runners.remove(r); + throw e; + } } finally { MDC.remove("ConcurrentUpdateSolrClient.url"); } @@ -640,9 +643,15 @@ public synchronized void blockUntilFinished() { } private void waitForEmptyQueue() { + boolean threadInterrupted = Thread.currentThread().isInterrupted(); while (!queue.isEmpty()) { if (log.isDebugEnabled()) emptyQueueLoops.incrementAndGet(); + if (scheduler.isTerminated()) { + log.warn("The task queue still has elements but the update scheduler {} is terminated. Can't process any more tasks. " + + "Queue size: {}, Runners: {}. Current thread Interrupted? {}", scheduler, queue.size(), runners.size(), threadInterrupted); + break; + } synchronized (runners) { int queueSize = queue.size(); @@ -656,10 +665,16 @@ private void waitForEmptyQueue() { try { queue.wait(250); } catch (InterruptedException e) { - Thread.currentThread().interrupt(); + // If we set the thread as interrupted again, the next time the wait it's called i t's going to return immediately + threadInterrupted = true; + log.warn("Thread interrupted while waiting for update queue to be empty. There are still {} elements in the queue.", + queue.size()); } } } + if (threadInterrupted) { + Thread.currentThread().interrupt(); + } } public void handleError(Throwable ex) { From caad85c93eedf6b7019118005b34ee77e0196237 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Tue, 16 May 2017 15:49:13 -0700 Subject: [PATCH 34/41] Extra debugging on the client code --- .../org/apache/solr/client/solrj/impl/CloudSolrClient.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index b7ffa1dcd41a..a9c6d6d7a99b 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -1325,6 +1325,7 @@ protected NamedList sendRequest(SolrRequest request, String collection) ClientUtils.addSlices(slices, collectionName, routeSlices, true); } Set liveNodes = stateProvider.liveNodes(); + log.debug("Live Nodes: {}", liveNodes);//nocommit List leaderUrlList = null; List urlList = null; @@ -1339,7 +1340,10 @@ protected NamedList sendRequest(SolrRequest request, String collection) ZkCoreNodeProps coreNodeProps = new ZkCoreNodeProps(nodeProps); String node = coreNodeProps.getNodeName(); if (!liveNodes.contains(coreNodeProps.getNodeName()) - || Replica.State.getState(coreNodeProps.getState()) != Replica.State.ACTIVE) continue; + || Replica.State.getState(coreNodeProps.getState()) != Replica.State.ACTIVE) { + log.debug("{} not in liveNodes, skipping", coreNodeProps.getNodeName());//nocommit + continue; + } if (nodes.put(node, nodeProps) == null) { if (!sendToLeaders || coreNodeProps.isLeader()) { String url; From a8bac800a30d8434208e33912e388b0fd9547818 Mon Sep 17 00:00:00 2001 From: Tomas Fernandez Lobbe Date: Wed, 17 May 2017 16:08:04 -0700 Subject: [PATCH 35/41] Removed some TODOs and nocommits. Some improvements to tests and other minor changes --- .../org/apache/solr/cloud/ZkController.java | 2 - .../org/apache/solr/handler/IndexFetcher.java | 8 +- .../solr/handler/RealTimeGetHandler.java | 4 +- .../solr/handler/ReplicationHandler.java | 2 +- .../handler/component/HttpShardHandler.java | 12 ++- .../component/RealTimeGetComponent.java | 2 +- .../org/apache/solr/update/UpdateHandler.java | 2 +- .../processor/DistributedUpdateProcessor.java | 11 ++- .../org/apache/solr/util/TestInjection.java | 5 +- .../solr/collection1/conf/solrconfig.xml | 8 +- ...nkeyNothingIsSafeWithPullReplicasTest.java | 17 +--- ...sMonkeySafeLeaderWithPullReplicasTest.java | 5 +- .../org/apache/solr/cloud/ShardSplitTest.java | 5 -- .../apache/solr/cloud/TestPullReplica.java | 6 +- .../apache/solr/cloud/TestTlogReplica.java | 83 ++++++++++++------- .../solrj/request/CollectionAdminRequest.java | 2 +- .../solr/common/cloud/ZkStateReader.java | 3 +- .../cloud/AbstractFullDistribZkTestBase.java | 11 +++ 18 files changed, 115 insertions(+), 73 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 53d67471b734..cb8175ea6b3e 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -1302,8 +1302,6 @@ public void unregister(String coreName, CoreDescriptor cd) throws InterruptedExc context.cancelElection(); } } -// //TODO: Do we need to stop replication for type==tlog? - CloudDescriptor cloudDescriptor = cd.getCloudDescriptor(); zkStateReader.unregisterCore(cloudDescriptor.getCollectionName()); diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java index 84b36b5bb0d6..7d15701e3969 100644 --- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java +++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java @@ -178,11 +178,13 @@ public static class IndexFetchResult { public static final IndexFetchResult INDEX_FETCH_FAILURE = new IndexFetchResult("Fetching lastest index is failed", false, null); public static final IndexFetchResult INDEX_FETCH_SUCCESS = new IndexFetchResult("Fetching latest index is successful", true, null); public static final IndexFetchResult LOCK_OBTAIN_FAILED = new IndexFetchResult("Obtaining SnapPuller lock failed", false, null); + public static final IndexFetchResult CONTAINER_IS_SHUTTING_DOWN = new IndexFetchResult("I was asked to replicate but CoreContainer is shutting down", false, null); public static final IndexFetchResult MASTER_VERSION_ZERO = new IndexFetchResult("Index in peer is empty and never committed yet", true, null); public static final IndexFetchResult NO_INDEX_COMMIT_EXIST = new IndexFetchResult("No IndexCommit in local index", false, null); public static final IndexFetchResult PEER_INDEX_COMMIT_DELETED = new IndexFetchResult("No files to download because IndexCommit in peer was deleted", false, null); public static final IndexFetchResult LOCAL_ACTIVITY_DURING_REPLICATION = new IndexFetchResult("Local index modification during replication", false, null); public static final IndexFetchResult EXPECTING_NON_LEADER = new IndexFetchResult("Replicating from leader but I'm the shard leader", false, null); + public static final IndexFetchResult LEADER_IS_NOT_ACTIVE = new IndexFetchResult("Replicating from leader but leader is not active", false, null); IndexFetchResult(String message, boolean successful, Throwable exception) { this.message = message; @@ -365,7 +367,11 @@ IndexFetchResult fetchLatestIndex(boolean forceReplication, boolean forceCoreRel } if (replica.getState() != Replica.State.ACTIVE) { LOG.info("Replica {} is leader but it's state is {}, skipping replication", replica.getName(), replica.getState()); - return IndexFetchResult.EXPECTING_NON_LEADER;//nocommit: not the correct error + return IndexFetchResult.LEADER_IS_NOT_ACTIVE; + } + if (!solrCore.getCoreContainer().getZkController().getClusterState().liveNodesContain(replica.getNodeName())) { + LOG.info("Replica {} is leader but it's not hosted on a live node, skipping replication", replica.getName()); + return IndexFetchResult.LEADER_IS_NOT_ACTIVE; } if (!replica.getCoreUrl().equals(masterUrl)) { masterUrl = replica.getCoreUrl(); diff --git a/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java b/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java index 247b65cd1420..9f2b693a0a11 100644 --- a/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/RealTimeGetHandler.java @@ -22,6 +22,7 @@ import org.apache.solr.api.Api; import org.apache.solr.api.ApiBag; +import org.apache.solr.handler.component.HttpShardHandler; import org.apache.solr.handler.component.RealTimeGetComponent; import org.apache.solr.handler.component.SearchHandler; import org.apache.solr.request.SolrQueryRequest; @@ -40,7 +41,8 @@ protected List getDefaultComponents() @Override public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { - req.getContext().put("distribOnlyRealtime", Boolean.TRUE); + // Tell HttpShardHandlerthat this request should only be distributed to NRT replicas + req.getContext().put(HttpShardHandler.ONLY_NRT_REPLICAS, Boolean.TRUE); super.handleRequestBody(req, rsp); } diff --git a/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java b/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java index 90d33bee0dc8..f3dcdeb92373 100644 --- a/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java @@ -405,7 +405,7 @@ public IndexFetchResult doFetch(SolrParams solrParams, boolean forceReplication) return IndexFetchResult.LOCK_OBTAIN_FAILED; if (core.getCoreContainer().isShutDown()) { LOG.warn("I was asked to replicate but CoreContainer is shutting down"); - return IndexFetchResult.LOCK_OBTAIN_FAILED;//nocommit: different + return IndexFetchResult.CONTAINER_IS_SHUTTING_DOWN; } try { if (masterUrl != null) { diff --git a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java index a2108be7001f..4ec3b7924f4f 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java @@ -60,6 +60,14 @@ import org.slf4j.MDC; public class HttpShardHandler extends ShardHandler { + + /** + * If the request context map has an entry with this key and Boolean.TRUE as value, + * {@link #prepDistributed(ResponseBuilder)} will only include {@link org.apache.solr.common.cloud.Replica.Type#NRT} replicas as possible + * destination of the distributed request (or a leader replica of type {@link org.apache.solr.common.cloud.Replica.Type#TLOG}). This is used + * by the RealtimeGet handler, since other types of replicas shouldn't respond to RTG requests + */ + public static String ONLY_NRT_REPLICAS = "distribOnlyRealtime"; private HttpShardHandlerFactory httpShardHandlerFactory; private CompletionService completionService; @@ -349,8 +357,8 @@ public void prepDistributed(ResponseBuilder rb) { // and make it a non-distributed request. String ourSlice = cloudDescriptor.getShardId(); String ourCollection = cloudDescriptor.getCollectionName(); - // Some requests may only be fulfilled by replicas of type Replica.Type.REALTIME - boolean onlyNrtReplicas = Boolean.TRUE == req.getContext().get("distribOnlyRealtime"); + // Some requests may only be fulfilled by replicas of type Replica.Type.NRT + boolean onlyNrtReplicas = Boolean.TRUE == req.getContext().get(ONLY_NRT_REPLICAS); if (rb.slices.length == 1 && rb.slices[0] != null && ( rb.slices[0].equals(ourSlice) || rb.slices[0].equals(ourCollection + "_" + ourSlice) ) // handle the _ format && cloudDescriptor.getLastPublished() == Replica.State.ACTIVE diff --git a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java index 676070586dd7..6d70435612f1 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/RealTimeGetComponent.java @@ -117,7 +117,7 @@ public void process(ResponseBuilder rb) throws IOException cloudDesc.getCoreNodeName(), Replica.Type.PULL)); } - // non-leader APPEND replicas should not respond to distrib /get requests, but internal requests are OK + // non-leader TLOG replicas should not respond to distrib /get requests, but internal requests are OK } } diff --git a/solr/core/src/java/org/apache/solr/update/UpdateHandler.java b/solr/core/src/java/org/apache/solr/update/UpdateHandler.java index 42abaf8f9e04..f0eb8bc7e127 100644 --- a/solr/core/src/java/org/apache/solr/update/UpdateHandler.java +++ b/solr/core/src/java/org/apache/solr/update/UpdateHandler.java @@ -126,7 +126,7 @@ public UpdateHandler(SolrCore core, UpdateLog updateLog) { // If this is a replica of type PULL, don't create the update log boolean skipUpdateLog = core.getCoreDescriptor().getCloudDescriptor() != null && !core.getCoreDescriptor().getCloudDescriptor().requiresTransactionLog(); - if (updateLog == null && ulogPluginInfo != null && !skipUpdateLog) { + if (updateLog == null && ulogPluginInfo != null && ulogPluginInfo.isEnabled() && !skipUpdateLog) { String dataDir = (String)ulogPluginInfo.initArgs.get("dir"); String ulogDir = core.getCoreDescriptor().getUlogDir(); diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java index e67f98219fc5..a91831cd16e4 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java @@ -1609,7 +1609,7 @@ public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException { String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId(); Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry( collection, myShardId); - // DBQ forwarded to Realtime and Append + // DBQ forwarded to NRT and TLOG replicas List replicaProps = zkController.getZkStateReader() .getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG)); if (replicaProps != null) { @@ -1700,7 +1700,7 @@ protected void versionDeleteByQuery(DeleteUpdateCommand cmd) throws IOException } if (replicaType == Replica.Type.TLOG && (cmd.getFlags() & UpdateCommand.REPLAY) == 0) { - // Append replica not leader, don't write the DBQ to IW + // TLOG replica not leader, don't write the DBQ to IW cmd.setFlags(cmd.getFlags() | UpdateCommand.IGNORE_INDEXWRITER); } doLocalDelete(cmd); @@ -1896,7 +1896,7 @@ public void processCommit(CommitUpdateCommand cmd) throws IOException { } if (!zkEnabled || req.getParams().getBool(COMMIT_END_POINT, false) || singleLeader) { - if (replicaType == Replica.Type.TLOG) { // REALTIME will always commit + if (replicaType == Replica.Type.TLOG) { try { Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry( collection, cloudDesc.getShardId()); @@ -1914,7 +1914,12 @@ public void processCommit(CommitUpdateCommand cmd) throws IOException { } catch (InterruptedException e) { throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + cloudDesc.getShardId(), e); } + } else if (replicaType == Replica.Type.PULL) { + log.warn("Commit not supported on replicas of type " + Replica.Type.PULL); } else { + // NRT replicas will always commit + long commitVersion = vinfo.getNewClock(); + cmd.setVersion(commitVersion); doLocalCommit(cmd); } } else { diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java index 7b3015194612..656428b5f82c 100644 --- a/solr/core/src/java/org/apache/solr/util/TestInjection.java +++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java @@ -385,13 +385,14 @@ public static boolean waitForInSyncWithLeader(SolrCore core, ZkController zkCont long leaderVersion = (long) ((NamedList)response.get("details")).get("indexVersion"); RefCounted searcher = core.getSearcher(); try { - String localVersion = searcher.get().getIndexReader().getIndexCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); +// String localVersion = searcher.get().getIndexReader().getIndexCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); + String localVersion = searcher.get().getIndexReader().getIndexCommit().getUserData().get(SolrIndexWriter.COMMIT_COMMAND_VERSION); if (localVersion == null && leaderVersion == 0 && !core.getUpdateHandler().getUpdateLog().hasUncommittedChanges()) return true; if (localVersion != null && Long.parseLong(localVersion) == leaderVersion && (leaderVersion >= t || i >= 6)) { log.info("Waiting time for tlog replica to be in sync with leader: {}", System.currentTimeMillis()-currentTime); return true; } else { - log.debug("Append replica not in sync with leader yet. Attempt: {}", i); + log.debug("Tlog replica not in sync with leader yet. Attempt: {}. Local Version={}, leader Version={}", i, localVersion, leaderVersion); Thread.sleep(500); } } finally { diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml index 58f9551b915c..a63f6cb17514 100644 --- a/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml @@ -63,6 +63,10 @@ + + ${solr.autoCommit.maxTime:-1} + +