From 16558ee0a1d57eba00c93190ec36eaa7186757df Mon Sep 17 00:00:00 2001 From: Mano Kovacs Date: Tue, 28 Aug 2018 19:39:38 +0200 Subject: [PATCH] SOLR-12708: Aggregate failures from downstream async jobs; add error handling for RestoreCmd --- solr/CHANGES.txt | 4 +- .../OverseerCollectionMessageHandler.java | 33 ++++++---- .../cloud/api/collections/RestoreCmd.java | 41 +++++++++++- .../solr/handler/admin/CoreAdminHandler.java | 3 +- .../cloud-minimal-faulty/conf/schema.xml | 29 ++++++++ .../cloud-minimal-faulty/conf/solrconfig.xml | 50 ++++++++++++++ .../AbstractCloudBackupRestoreTestCase.java | 66 ++++++++++++++++++- .../TestHdfsCloudBackupRestore.java | 2 +- .../TestLocalFSCloudBackupRestore.java | 3 +- 9 files changed, 212 insertions(+), 19 deletions(-) create mode 100644 solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/schema.xml create mode 100644 solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/solrconfig.xml diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 6128986d48b3..03c1a210242a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -71,6 +71,8 @@ Other Changes java.time.DateTimeFormatter instead of Joda time (see upgrade notes). "Lenient" is enabled. Removed Joda Time dependency. (David Smiley, Bar Rotstein) +* SOLR-12708: Fix async collection admin operations to report downstream failures (Mano Kovacs) + ================== 7.6.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. @@ -462,7 +464,7 @@ Other Changes * SOLR-8742: In HdfsDirectoryTest replace RAMDirectory usages with ByteBuffersDirectory. (hossman, Mark Miller, Andrzej Bialecki, Steve Rowe) - + * SOLR-12771: Improve Autoscaling Policy and Preferences documentation. (hossman, Steve Rowe) ================== 7.4.0 ================== diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java index a724bc78f19d..065e7df9210a 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java @@ -156,6 +156,8 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler, COLOCATED_WITH, null)); private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + public static final String FAILURE_FIELD = "failure"; + public static final String SUCCESS_FIELD = "success"; Overseer overseer; ShardHandlerFactory shardHandlerFactory; @@ -837,6 +839,15 @@ private void processResponse(NamedList results, ShardResponse srsp, Set processResponse(results, e, nodeName, solrResponse, shard, okayExceptions); } + private SimpleOrderedMap getOrCreateMap(NamedList results, String key) { + SimpleOrderedMap map = (SimpleOrderedMap) results.get(key); + if (map == null) { + map = new SimpleOrderedMap(); + results.add(key, map); + } + return map; + } + @SuppressWarnings("unchecked") private void processResponse(NamedList results, Throwable e, String nodeName, SolrResponse solrResponse, String shard, Set okayExceptions) { String rootThrowable = null; @@ -847,21 +858,13 @@ private void processResponse(NamedList results, Throwable e, String nodeName, So if (e != null && (rootThrowable == null || !okayExceptions.contains(rootThrowable))) { log.error("Error from shard: " + shard, e); - SimpleOrderedMap failure = (SimpleOrderedMap) results.get("failure"); - if (failure == null) { - failure = new SimpleOrderedMap(); - results.add("failure", failure); - } + SimpleOrderedMap failure = getOrCreateMap(results, FAILURE_FIELD); failure.add(nodeName, e.getClass().getName() + ":" + e.getMessage()); } else { - SimpleOrderedMap success = (SimpleOrderedMap) results.get("success"); - if (success == null) { - success = new SimpleOrderedMap(); - results.add("success", success); - } + SimpleOrderedMap success = getOrCreateMap(results, SUCCESS_FIELD); success.add(nodeName, solrResponse.getResponse()); } @@ -871,7 +874,15 @@ private void processResponse(NamedList results, Throwable e, String nodeName, So private void waitForAsyncCallsToComplete(Map requestMap, NamedList results) { for (String k:requestMap.keySet()) { log.debug("I am Waiting for :{}/{}", k, requestMap.get(k)); - results.add(requestMap.get(k), waitForCoreAdminAsyncCallToComplete(k, requestMap.get(k))); + NamedList response = waitForCoreAdminAsyncCallToComplete(k, requestMap.get(k)); + results.add(requestMap.get(k), response); // backward compatibility reasons + String msg = (String)response.get("Response"); + if ("failed".equalsIgnoreCase(((String)response.get("STATUS")))) { + log.error("Error from shard " + k + ": " + msg); + getOrCreateMap(results, FAILURE_FIELD).add(k, msg); + } else { + getOrCreateMap(results, SUCCESS_FIELD).add(k, msg); + } } } diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java index d082ac337c5f..68b6e77f11c6 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java @@ -32,6 +32,9 @@ import java.util.Optional; import java.util.Properties; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import org.apache.solr.client.solrj.cloud.DistributedQueue; import org.apache.solr.client.solrj.cloud.autoscaling.PolicyHelper; @@ -51,6 +54,7 @@ import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.StrUtils; import org.apache.solr.common.util.Utils; import org.apache.solr.core.CoreContainer; @@ -238,9 +242,12 @@ public void call(ClusterState state, ZkNodeProps message, NamedList results) thr message, sliceNames, numNrtReplicas, numTlogReplicas, numPullReplicas); sessionWrapper = PolicyHelper.getLastSessionWrapper(true); + + CountDownLatch countDownLatch = new CountDownLatch(restoreCollection.getSlices().size()); + //Create one replica per shard and copy backed up data to it for (Slice slice : restoreCollection.getSlices()) { - log.debug("Adding replica for shard={} collection={} ", slice.getName(), restoreCollection); + log.info("Adding replica for shard={} collection={} ", slice.getName(), restoreCollection); HashMap propMap = new HashMap<>(); propMap.put(Overseer.QUEUE_OPERATION, CREATESHARD); propMap.put(COLLECTION_PROP, restoreCollectionName); @@ -271,7 +278,37 @@ public void call(ClusterState state, ZkNodeProps message, NamedList results) thr propMap.put(ASYNC, asyncId); } ocmh.addPropertyParams(message, propMap); - ocmh.addReplica(clusterState, new ZkNodeProps(propMap), new NamedList(), null); + final NamedList addResult = new NamedList(); + ocmh.addReplica(clusterState, new ZkNodeProps(propMap), addResult, () -> { + countDownLatch.countDown(); + Object addResultFailure = addResult.get("failure"); + if (addResultFailure != null) { + SimpleOrderedMap failure = (SimpleOrderedMap) results.get("failure"); + if (failure == null) { + failure = new SimpleOrderedMap(); + results.add("failure", failure); + } + failure.addAll((NamedList) addResultFailure); + } else { + SimpleOrderedMap success = (SimpleOrderedMap) results.get("success"); + if (success == null) { + success = new SimpleOrderedMap(); + results.add("success", success); + } + success.addAll((NamedList) addResult.get("success")); + } + }); + } + + boolean allIsDone = countDownLatch.await(1, TimeUnit.HOURS); + if (!allIsDone) { + throw new TimeoutException("Initial replicas were not created within 10 minutes. Timing out."); + } + Object failures = results.get("failure"); + if (failures != null && ((SimpleOrderedMap) failures).size() > 0) { + log.error("Restore failed to create initial replicas."); + ocmh.cleanupCollection(restoreCollectionName, new NamedList()); + return; } //refresh the location copy of collection state diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 66dc39e57e8a..e3fef254bf74 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -194,8 +194,9 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw removeTask("running", taskObject.taskId); if (exceptionCaught) { addTask("failed", taskObject, true); - } else + } else { addTask("completed", taskObject, true); + } } }); } finally { diff --git a/solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/schema.xml b/solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/schema.xml new file mode 100644 index 000000000000..4124feab0c3d --- /dev/null +++ b/solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/schema.xml @@ -0,0 +1,29 @@ + + + + + + + + + + + + + id + diff --git a/solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/solrconfig.xml b/solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/solrconfig.xml new file mode 100644 index 000000000000..c486043e9e8e --- /dev/null +++ b/solr/core/src/test-files/solr/configsets/cloud-minimal-faulty/conf/solrconfig.xml @@ -0,0 +1,50 @@ + + + + + + + + + ${solr.data.dir:} + + + + + ${tests.luceneMatchVersion:LATEST} + + + ${solr.commitwithin.softcommit:true} + + + + + + + explicit + true + text + + + + + + + + diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java index 17089b86aa04..430e35cb1680 100644 --- a/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java @@ -57,12 +57,14 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa protected static final int NUM_SHARDS = 2;//granted we sometimes shard split to get more protected static final int NUM_SPLIT_SHARDS = 3; //We always split shard1 so total shards post split will be 3 + protected static final String BACKUPNAME_PREFIX = "mytestbackup"; int replFactor; int numTlogReplicas; int numPullReplicas; private static long docsSeed; // see indexDocs() + private String testSuffix = "test1"; @BeforeClass public static void createCluster() throws Exception { @@ -72,7 +74,7 @@ public static void createCluster() throws Exception { /** * @return The name of the collection to use. */ - public abstract String getCollectionName(); + public abstract String getCollectionNamePrefix(); /** * @return The name of the backup repository to use. @@ -85,8 +87,18 @@ public static void createCluster() throws Exception { */ public abstract String getBackupLocation(); + + public String getCollectionName(){ + return getCollectionNamePrefix() + "_" + testSuffix; + } + + public void setTestSuffix(String testSuffix) { + this.testSuffix = testSuffix; + } + @Test public void test() throws Exception { + setTestSuffix("testok"); boolean isImplicit = random().nextBoolean(); boolean doSplitShardOperation = !isImplicit && random().nextBoolean(); replFactor = TestUtil.nextInt(random(), 1, 2); @@ -146,6 +158,56 @@ public void test() throws Exception { testInvalidPath(getCollectionName()); } + @Test + public void testRestoreFailure() throws Exception { + setTestSuffix("testfailure"); + replFactor = TestUtil.nextInt(random(), 1, 2); + numTlogReplicas = TestUtil.nextInt(random(), 0, 1); + numPullReplicas = TestUtil.nextInt(random(), 0, 1); + + CollectionAdminRequest.Create create = + CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor, numTlogReplicas, numPullReplicas); + + if (NUM_SHARDS * (replFactor + numTlogReplicas + numPullReplicas) > cluster.getJettySolrRunners().size()) { + create.setMaxShardsPerNode((int)Math.ceil(NUM_SHARDS * (replFactor + numTlogReplicas + numPullReplicas) / cluster.getJettySolrRunners().size())); //just to assert it survives the restoration + } + + CloudSolrClient solrClient = cluster.getSolrClient(); + create.process(solrClient); + + indexDocs(getCollectionName(), false); + + + String backupLocation = getBackupLocation(); + String backupName = BACKUPNAME_PREFIX + testSuffix; + + DocCollection backupCollection = solrClient.getZkStateReader().getClusterState().getCollection(getCollectionName()); + + log.info("Triggering Backup command"); + + { + CollectionAdminRequest.Backup backup = CollectionAdminRequest.backupCollection(getCollectionName(), backupName) + .setLocation(backupLocation).setRepositoryName(getBackupRepoName()); + assertEquals(0, backup.process(solrClient).getStatus()); + } + + log.info("Triggering Restore command"); + + String restoreCollectionName = getCollectionName() + "_restored"; + + { + CollectionAdminRequest.Restore restore = CollectionAdminRequest.restoreCollection(restoreCollectionName, backupName) + .setLocation(backupLocation).setRepositoryName(getBackupRepoName()); + if (backupCollection.getReplicas().size() > cluster.getJettySolrRunners().size()) { + // may need to increase maxShardsPerNode (e.g. if it was shard split, then now we need more) + restore.setMaxShardsPerNode((int)Math.ceil(backupCollection.getReplicas().size()/cluster.getJettySolrRunners().size())); + } + + restore.setConfigName("confFaulty"); + assertEquals(RequestStatusState.FAILED, restore.processAndWait(solrClient, 30)); + } + } + /** * This test validates the backup of collection configuration using * {@linkplain CollectionAdminParams#NO_INDEX_BACKUP_STRATEGY}. @@ -226,7 +288,7 @@ private int indexDocs(String collectionName, boolean useUUID) throws Exception { private void testBackupAndRestore(String collectionName, int backupReplFactor) throws Exception { String backupLocation = getBackupLocation(); - String backupName = "mytestbackup"; + String backupName = BACKUPNAME_PREFIX + testSuffix; CloudSolrClient client = cluster.getSolrClient(); DocCollection backupCollection = client.getZkStateReader().getClusterState().getCollection(collectionName); diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java index e81bc4bbb6c6..339648973f66 100644 --- a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java +++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestHdfsCloudBackupRestore.java @@ -153,7 +153,7 @@ public static void teardownClass() throws Exception { } @Override - public String getCollectionName() { + public String getCollectionNamePrefix() { return "hdfsbackuprestore"; } diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java index 34c6e3d80729..608d63596614 100644 --- a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java +++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestLocalFSCloudBackupRestore.java @@ -32,6 +32,7 @@ public class TestLocalFSCloudBackupRestore extends AbstractCloudBackupRestoreTes public static void setupClass() throws Exception { configureCluster(NUM_SHARDS)// nodes .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf")) + .addConfig("confFaulty", TEST_PATH().resolve("configsets").resolve("cloud-minimal-faulty").resolve("conf")) .configure(); boolean whitespacesInPath = random().nextBoolean(); @@ -43,7 +44,7 @@ public static void setupClass() throws Exception { } @Override - public String getCollectionName() { + public String getCollectionNamePrefix() { return "backuprestore"; }