From 657ea504537f7927a1e1caa12c0ef0064b4a8bd8 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Tue, 18 May 2021 17:42:35 -0600 Subject: [PATCH 1/5] SOLR-15399: Pull replica shouldn't commit locally in IndexFetcher --- solr/CHANGES.txt | 3 +++ .../org/apache/solr/cloud/ReplicateFromLeader.java | 14 +++++++++++++- .../org/apache/solr/cloud/TestPullReplica.java | 7 +------ .../apache/solr/cloud/TestPullReplicaWithAuth.java | 3 ++- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 8e07aaba374..a2f3647f7e0 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -397,6 +397,9 @@ Bug Fixes * SOLR-11904: Mark ReplicationHandler's polling thread as a Solr server thread so the PKI Interceptor is activated to allow PULL replicas to replicate from security-enabled leaders (Timothy Potter, Torsten Bøgh Köster) +* SOLR-15399: Fix flaky TestPullReplica test by ensuring Pull replicas do not commit locally when the leader version + is zero (Timothy Potter) + Other Changes --------------------- * SOLR-15118: Deprecate CollectionAdminRequest.getV2Request(). (Jason Gerlowski) diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index 7b18ffd773d..7f276a91fdf 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -21,6 +21,7 @@ import org.apache.lucene.index.IndexCommit; import org.apache.solr.common.SolrException; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.CoreContainer; @@ -87,7 +88,18 @@ public void startReplication(boolean switchTransactionLog) { NamedList followerConfig = new NamedList<>(); followerConfig.add("fetchFromLeader", Boolean.TRUE); - followerConfig.add(ReplicationHandler.SKIP_COMMIT_ON_LEADER_VERSION_ZERO, switchTransactionLog); + + // don't commit on leader version zero for PULL replicas as PULL should only get its index state from leader + boolean skipCommitOnLeaderVersionZero = switchTransactionLog; + final CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor(); + if (cloudDescriptor != null) { + Replica replica = + cc.getZkController().zkStateReader.getCollection(cloudDescriptor.getCollectionName()) + .getSlice(cloudDescriptor.getShardId()).getReplica(cloudDescriptor.getCoreNodeName()); + skipCommitOnLeaderVersionZero = replica != null && replica.getType() == Replica.Type.PULL; + } + followerConfig.add(ReplicationHandler.SKIP_COMMIT_ON_LEADER_VERSION_ZERO, skipCommitOnLeaderVersionZero); + followerConfig.add("pollInterval", pollIntervalStr); NamedList replicationConfig = new NamedList<>(); replicationConfig.add("follower", followerConfig); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java index 779bae5e38f..49d5f4e05eb 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java @@ -66,7 +66,7 @@ import org.slf4j.LoggerFactory; @Slow -@LogLevel("org.apache.solr.handler.ReplicationHandler=DEBUG,org.apache.solr.handler.IndexFetcher=DEBUG") +@LogLevel("org.apache.solr.handler.ReplicationHandler=DEBUG;org.apache.solr.handler.IndexFetcher=DEBUG") public class TestPullReplica extends SolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -80,7 +80,6 @@ private String suggestedCollectionName() { @BeforeClass public static void createTestCluster() throws Exception { - // cloudSolrClientMaxStaleRetries System.setProperty("cloudSolrClientMaxStaleRetries", "1"); System.setProperty("zkReaderGetLeaderRetryTimeoutMs", "1000"); @@ -122,7 +121,6 @@ public void tearDown() throws Exception { } @Repeat(iterations=2) // 2 times to make sure cleanup is complete and we can create the same collection - // commented out on: 17-Feb-2019 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018 public void testCreateDelete() throws Exception { try { switch (random().nextInt(3)) { @@ -209,7 +207,6 @@ static void assertUlogPresence(DocCollection collection) { } @SuppressWarnings("unchecked") - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") public void testAddDocs() throws Exception { int numPullReplicas = 1 + random().nextInt(3); CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numPullReplicas) @@ -282,13 +279,11 @@ public void testAddRemovePullReplica() throws Exception { } @Test - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") public void testRemoveAllWriterReplicas() throws Exception { doTestNoLeader(true); } @Test - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") public void testKillLeader() throws Exception { doTestNoLeader(false); } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java index d80b9efec0f..cca61991152 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplicaWithAuth.java @@ -41,6 +41,7 @@ import org.apache.solr.security.BasicAuthPlugin; import org.apache.solr.security.RuleBasedAuthorizationPlugin; import org.junit.BeforeClass; +import org.junit.Test; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -84,7 +85,7 @@ private QueryResponse queryWithBasicAuth(HttpSolrClient client, SolrQuery q) thr return withBasicAuth(new QueryRequest(q)).process(client); } - @BadApple(bugUrl = "https://issues.apache.org/jira/browse/SOLR-15399") + @Test public void testPKIAuthWorksForPullReplication() throws Exception { int numPullReplicas = 2; withBasicAuth(CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numPullReplicas)) From 94da20ef3604e70908c505a419666148dea5d8d6 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Wed, 19 May 2021 09:42:23 -0600 Subject: [PATCH 2/5] Fix if logic to only change skipCommitOnLeaderVersionZero for PULL replicas --- .../src/java/org/apache/solr/cloud/ReplicateFromLeader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index 7f276a91fdf..fd22fe307a2 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -96,7 +96,9 @@ public void startReplication(boolean switchTransactionLog) { Replica replica = cc.getZkController().zkStateReader.getCollection(cloudDescriptor.getCollectionName()) .getSlice(cloudDescriptor.getShardId()).getReplica(cloudDescriptor.getCoreNodeName()); - skipCommitOnLeaderVersionZero = replica != null && replica.getType() == Replica.Type.PULL; + if (replica != null && replica.getType() == Replica.Type.PULL) { + skipCommitOnLeaderVersionZero = true; // only set this to true if we're a PULL replica, otherwise use value of switchTransactionLog + } } followerConfig.add(ReplicationHandler.SKIP_COMMIT_ON_LEADER_VERSION_ZERO, skipCommitOnLeaderVersionZero); From af64b96ece2b81bdad8e1a5d03e6041cc0a21312 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Wed, 19 May 2021 16:33:43 -0600 Subject: [PATCH 3/5] Address review comments and restart a PULL replica before adding docs to verify recovery --- .../solr/cloud/ReplicateFromLeader.java | 16 ++++---- .../apache/solr/cloud/TestPullReplica.java | 40 ++++++++++++++++++- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java index fd22fe307a2..06f67160214 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java @@ -91,13 +91,15 @@ public void startReplication(boolean switchTransactionLog) { // don't commit on leader version zero for PULL replicas as PULL should only get its index state from leader boolean skipCommitOnLeaderVersionZero = switchTransactionLog; - final CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor(); - if (cloudDescriptor != null) { - Replica replica = - cc.getZkController().zkStateReader.getCollection(cloudDescriptor.getCollectionName()) - .getSlice(cloudDescriptor.getShardId()).getReplica(cloudDescriptor.getCoreNodeName()); - if (replica != null && replica.getType() == Replica.Type.PULL) { - skipCommitOnLeaderVersionZero = true; // only set this to true if we're a PULL replica, otherwise use value of switchTransactionLog + if (!skipCommitOnLeaderVersionZero) { + CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor(); + if (cloudDescriptor != null) { + Replica replica = + cc.getZkController().getZkStateReader().getCollection(cloudDescriptor.getCollectionName()) + .getSlice(cloudDescriptor.getShardId()).getReplica(cloudDescriptor.getCoreNodeName()); + if (replica != null && replica.getType() == Replica.Type.PULL) { + skipCommitOnLeaderVersionZero = true; // only set this to true if we're a PULL replica, otherwise use value of switchTransactionLog + } } } followerConfig.add(ReplicationHandler.SKIP_COMMIT_ON_LEADER_VERSION_ZERO, skipCommitOnLeaderVersionZero); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java index 49d5f4e05eb..25a2a310631 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java @@ -211,10 +211,14 @@ public void testAddDocs() throws Exception { int numPullReplicas = 1 + random().nextInt(3); CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numPullReplicas) .process(cluster.getSolrClient()); - waitForState("Expected collection to be created with 1 shard and " + (numPullReplicas + 1) + " replicas", collectionName, clusterShape(1, numPullReplicas + 1)); + waitForState("Expected collection to be created with 1 shard and " + (numPullReplicas + 1) + " replicas", + collectionName, clusterShape(1, numPullReplicas + 1)); DocCollection docCollection = assertNumberOfReplicas(1, 0, numPullReplicas, false, true); assertEquals(1, docCollection.getSlices().size()); + // ugly but needed to ensure a full PULL replication cycle (every sec) has occurred on the replicas before adding docs + Thread.sleep(1500); + boolean reloaded = false; int numDocs = 0; while (true) { @@ -229,7 +233,8 @@ public void testAddDocs() throws Exception { } log.info("Found {} docs in leader, verifying updates make it to {} pull replicas", numDocs, numPullReplicas); - List pullReplicas = s.getReplicas(EnumSet.of(Replica.Type.PULL)); + List pullReplicas = + (numDocs == 1) ? restartPullReplica(docCollection, numPullReplicas) : s.getReplicas(EnumSet.of(Replica.Type.PULL)); waitForNumDocsInAllReplicas(numDocs, pullReplicas); for (Replica r : pullReplicas) { @@ -255,6 +260,37 @@ public void testAddDocs() throws Exception { assertUlogPresence(docCollection); } + private List restartPullReplica(DocCollection docCollection, int numPullReplicas) throws Exception { + Slice s = docCollection.getSlices().iterator().next(); + List pullReplicas = s.getReplicas(EnumSet.of(Replica.Type.PULL)); + // make sure a PULL replica recovers this first doc after a restart + JettySolrRunner leaderJetty = cluster.getReplicaJetty(s.getLeader()); + + // find a node with a PULL replica that's not hosting the leader + JettySolrRunner replicaJetty = null; + for (Replica r : pullReplicas) { + JettySolrRunner jetty = cluster.getReplicaJetty(r); + if (!jetty.getNodeName().equals(leaderJetty.getNodeName())) { + replicaJetty = jetty; + break; + } + } + + // stop / start the node with a pull replica + if (replicaJetty != null) { + replicaJetty.stop(); + cluster.waitForJettyToStop(replicaJetty); + replicaJetty.start(); + waitForState("Expected collection to be created with 1 shard and " + (numPullReplicas + 1) + " replicas", + collectionName, clusterShape(1, numPullReplicas + 1)); + docCollection = assertNumberOfReplicas(1, 0, numPullReplicas, false, true); + s = docCollection.getSlices().iterator().next(); + pullReplicas = s.getReplicas(EnumSet.of(Replica.Type.PULL)); + } // else it's ok if all replicas ended up on the same node, we're not testing replica placement here, but skip this part of the test + + return pullReplicas; + } + public void testAddRemovePullReplica() throws Exception { CollectionAdminRequest.createCollection(collectionName, "conf", 2, 1, 0, 0) .process(cluster.getSolrClient()); From 3402484baa692171c2f5da3119d77169fdcca993 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Wed, 19 May 2021 18:49:54 -0600 Subject: [PATCH 4/5] Improve assert message --- solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java index 25a2a310631..725177c78db 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java @@ -280,8 +280,9 @@ private List restartPullReplica(DocCollection docCollection, int numPul if (replicaJetty != null) { replicaJetty.stop(); cluster.waitForJettyToStop(replicaJetty); + waitForState("Expected to see a downed PULL replica", collectionName, clusterStateReflectsActiveAndDownReplicas()); replicaJetty.start(); - waitForState("Expected collection to be created with 1 shard and " + (numPullReplicas + 1) + " replicas", + waitForState("Expected collection to have recovered with 1 shard and " + (numPullReplicas + 1) + " replicas after restarting " + replicaJetty.getNodeName(), collectionName, clusterShape(1, numPullReplicas + 1)); docCollection = assertNumberOfReplicas(1, 0, numPullReplicas, false, true); s = docCollection.getSlices().iterator().next(); From a3bbc177c4d81f4c7fa1693a5647ae33374bc385 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Thu, 20 May 2021 07:57:33 -0600 Subject: [PATCH 5/5] Remove stray comment after refactor --- solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java index 725177c78db..caf6aa9f875 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestPullReplica.java @@ -263,10 +263,9 @@ public void testAddDocs() throws Exception { private List restartPullReplica(DocCollection docCollection, int numPullReplicas) throws Exception { Slice s = docCollection.getSlices().iterator().next(); List pullReplicas = s.getReplicas(EnumSet.of(Replica.Type.PULL)); - // make sure a PULL replica recovers this first doc after a restart - JettySolrRunner leaderJetty = cluster.getReplicaJetty(s.getLeader()); // find a node with a PULL replica that's not hosting the leader + JettySolrRunner leaderJetty = cluster.getReplicaJetty(s.getLeader()); JettySolrRunner replicaJetty = null; for (Replica r : pullReplicas) { JettySolrRunner jetty = cluster.getReplicaJetty(r);