Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SOLR-16416: Retry overseerPrioritizer ops on failure #1129

Merged
merged 8 commits into from Oct 26, 2022
2 changes: 2 additions & 0 deletions solr/CHANGES.txt
Expand Up @@ -262,6 +262,8 @@ Bug Fixes

* SOLR-16485: Fix NPE in ShardHandlerFactory when running in Standalone mode (Houston Putman)

* SOLR-16416: OverseerPrioritizer now runs after all handlers are registered, and retries on failures. (Houston Putman)

Other Changes
---------------------
* SOLR-16351: Upgrade Carrot2 to 4.4.3, upgrade randomizedtesting to 2.8.0. (Dawid Weiss)
Expand Down
Expand Up @@ -21,6 +21,7 @@
import java.util.List;
import java.util.Map;
import org.apache.solr.client.solrj.impl.ZkDistribStateManager;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CoreAdminParams;
Expand Down Expand Up @@ -117,11 +118,13 @@ public synchronized void prioritizeOverseerNodes(String overseerId) throws Excep
}
if (!designateNodeId.equals(electionNodes.get(1))) { // checking if it is already at no:1
log.info("asking node {} to come join election at head", designateNodeId);
invokeOverseerOp(designateNodeId, "rejoinAtHead"); // ask designate to come first
invokeOverseerOpWithRetries(
designateNodeId, "rejoinAtHead", 5); // ask designate to come first
if (log.isInfoEnabled()) {
log.info("asking the old first in line {} to rejoin election ", electionNodes.get(1));
}
invokeOverseerOp(electionNodes.get(1), "rejoin"); // ask second inline to go behind
invokeOverseerOpWithRetries(
electionNodes.get(1), "rejoin", 5); // ask second inline to go behind
if (log.isInfoEnabled()) {
List<String> newElectionNodes =
OverseerTaskProcessor.getSortedElectionNodes(
Expand All @@ -133,6 +136,22 @@ public synchronized void prioritizeOverseerNodes(String overseerId) throws Excep
overseer.sendQuitToOverseer(OverseerTaskProcessor.getLeaderId(zkStateReader.getZkClient()));
}

private void invokeOverseerOpWithRetries(String electionNode, String op, int retryCount) {
boolean successful = false;
for (int i = 0; i < retryCount && !successful; i++) {
try {
invokeOverseerOp(electionNode, op);
successful = true;
} catch (SolrException e) {
if (i < retryCount - 1) {
HoustonPutman marked this conversation as resolved.
Show resolved Hide resolved
log.warn("Exception occurred while invoking Overseer Operation '{}'. Retrying.", op, e);
} else {
throw e;
}
}
}
}

private void invokeOverseerOp(String electionNode, String op) {
ModifiableSolrParams params = new ModifiableSolrParams();
ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
Expand All @@ -149,8 +168,10 @@ private void invokeOverseerOp(String electionNode, String op) {
shardHandler.submit(sreq, replica, sreq.params);
ShardResponse response = shardHandler.takeCompletedOrError();
if (response.getException() != null) {
log.error(
"Exception occurred while invoking Overseer Operation: {}", op, response.getException());
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"Exception occurred while invoking Overseer Operation: " + op,
response.getException());
}
}
}
29 changes: 16 additions & 13 deletions solr/core/src/java/org/apache/solr/core/CoreContainer.java
Expand Up @@ -1055,19 +1055,6 @@ public void load() {
clusterSingletons.getSingletons().put(singleton.getName(), singleton);
}
});

clusterSingletons.setReady();
if (NodeRoles.MODE_PREFERRED.equals(nodeRoles.getRoleMode(NodeRoles.Role.OVERSEER))) {
try {
log.info("This node has been started as a preferred overseer");
zkSys.getZkController().setPreferredOverseer();
} catch (KeeperException | InterruptedException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
if (!distributedCollectionCommandRunner.isPresent()) {
zkSys.getZkController().checkOverseerDesignate();
}
}

final CoreContainer thisCCRef = this;
Expand All @@ -1085,6 +1072,22 @@ protected void configure() {
});
jerseyAppHandler = new ApplicationHandler(containerHandlers.getJerseyEndpoints());

// Do Node setup logic after all handlers have been registered.
if (isZooKeeperAware()) {
clusterSingletons.setReady();
if (NodeRoles.MODE_PREFERRED.equals(nodeRoles.getRoleMode(NodeRoles.Role.OVERSEER))) {
try {
log.info("This node has been started as a preferred overseer");
zkSys.getZkController().setPreferredOverseer();
} catch (KeeperException | InterruptedException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
if (!distributedCollectionCommandRunner.isPresent()) {
zkSys.getZkController().checkOverseerDesignate();
}
}

// This is a bit redundant, but these are two distinct concepts even though they're
// accomplished at the same time.
status |= LOAD_COMPLETE | INITIAL_CORE_LOAD_COMPLETE;
Expand Down