From 1a48dc3310ed2319cf39a3a1f38aa06d5d87f427 Mon Sep 17 00:00:00 2001 From: Ali Beyad Date: Thu, 30 Mar 2017 18:21:52 -0400 Subject: [PATCH] Fixes snapshot status on failed snapshots If a snapshot is taken on multiple indices, and some of them are "good" indices that don't contain any corruption or failures, and some of them are "bad" indices that contain missing shards or corrupted shards, and if the snapshot request is set to partial=false (meaning don't take a snapshot if there are any failures), then the good indices will not be snapshotted either. Previously, when getting the status of such a snapshot, a 500 error would be thrown, because the snap-*.dat blob for the shards in the good index could not be found. This commit fixes the problem by reporting shards of good indices as failed due to a failed snapshot, instead of throwing the NoSuchFileException. Closes #23716 --- .../snapshots/SnapshotsService.java | 41 +++++++++- .../SharedClusterSnapshotRestoreIT.java | 74 +++++++++++++++++++ 2 files changed, 111 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 2a615649fcffe..fbc9af1660d8f 100644 --- a/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -60,6 +60,7 @@ import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.index.Index; import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.index.snapshots.IndexShardRestoreFailedException; import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus; import org.elasticsearch.repositories.IndexId; import org.elasticsearch.repositories.RepositoriesService; @@ -70,6 +71,7 @@ import org.elasticsearch.threadpool.ThreadPool; import java.io.IOException; +import java.nio.file.NoSuchFileException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -550,7 +552,8 @@ public List currentSnapshots(final String repository, /** * Returns status of shards currently finished snapshots *

- * This method is executed on master node and it's complimentary to the {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it + * This method is executed on master node and it's complimentary to the + * {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it * returns similar information but for already finished snapshots. *

* @@ -564,6 +567,8 @@ public Map snapshotShards(final String reposi Repository repository = repositoriesService.repository(repositoryName); RepositoryData repositoryData = repository.getRepositoryData(); MetaData metaData = repository.getSnapshotMetaData(snapshotInfo, repositoryData.resolveIndices(snapshotInfo.indices())); + Map nonFailedShards = new HashMap<>(); + boolean hasFailedShards = false; for (String index : snapshotInfo.indices()) { IndexId indexId = repositoryData.resolveIndexId(index); IndexMetaData indexMetaData = metaData.indices().get(index); @@ -577,14 +582,42 @@ public Map snapshotShards(final String reposi shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE); shardSnapshotStatus.failure(shardFailure.reason()); shardStatus.put(shardId, shardSnapshotStatus); + hasFailedShards = true; } else { - IndexShardSnapshotStatus shardSnapshotStatus = - repository.getShardSnapshotStatus(snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId); - shardStatus.put(shardId, shardSnapshotStatus); + nonFailedShards.put(shardId, indexId); } } } } + for (Map.Entry shard : nonFailedShards.entrySet()) { + final ShardId shardId = shard.getKey(); + final IndexId indexId = shard.getValue(); + IndexShardSnapshotStatus shardSnapshotStatus; + try { + shardSnapshotStatus = repository.getShardSnapshotStatus( + snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId); + } catch (IndexShardRestoreFailedException e) { + if (e.getCause() != null + && e.getCause() instanceof NoSuchFileException + && hasFailedShards) { + // the snap-*.dat file could not be found for the index, even + // though there was no shard failure for the index, this can + // happen when a snapshot is *not* set to partial (won't take + // partial snapshots) and has indices that contain missing + // or corrupted shards, in which case the snapshot will not proceed + // for any of the indices, even the ones that have no shard failures + shardSnapshotStatus = new IndexShardSnapshotStatus(); + shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE); + String msg = "snapshot not taken because partial snapshots are not " + + "configured for this snapshot and another shard caused the " + + "snapshot to fail"; + shardSnapshotStatus.failure(msg); + } else { + throw e; + } + } + shardStatus.put(shardId, shardSnapshotStatus); + } return unmodifiableMap(shardStatus); } diff --git a/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java b/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java index 77a0514a140ba..7c4a261e13177 100644 --- a/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java +++ b/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java @@ -54,6 +54,7 @@ import org.elasticsearch.cluster.metadata.MappingMetaData; import org.elasticsearch.cluster.metadata.MetaDataIndexStateService; import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; @@ -2737,4 +2738,77 @@ public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception { assertEquals(SnapshotState.SUCCESS, getSnapshotsResponse.getSnapshots().get(0).state()); } + public void testSnapshotStatusOnFailedIndex() throws Exception { + logger.info("--> creating repository"); + final Path repoPath = randomRepoPath(); + final Client client = client(); + assertAcked(client.admin().cluster() + .preparePutRepository("test-repo") + .setType("fs") + .setVerify(false) + .setSettings(Settings.builder().put("location", repoPath))); + + logger.info("--> creating good index"); + assertAcked(prepareCreate("test-idx-good") + .setSettings(Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 0))); + ensureGreen(); + final int numDocs = randomIntBetween(1, 5); + for (int i = 0; i < numDocs; i++) { + index("test-idx-good", "doc", Integer.toString(i), "foo", "bar" + i); + } + refresh(); + + logger.info("--> creating bad index"); + assertAcked(prepareCreate("test-idx-bad") + .setWaitForActiveShards(ActiveShardCount.NONE) + .setSettings(Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 0) + // set shard allocation to none so the primary cannot be + // allocated - simulates a "bad" index that fails to snapshot + .put(EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), + "none"))); + + logger.info("--> snapshot bad index and get status"); + client.admin().cluster() + .prepareCreateSnapshot("test-repo", "test-snap1") + .setWaitForCompletion(true) + .setIndices("test-idx-bad") + .get(); + SnapshotsStatusResponse snapshotsStatusResponse = client.admin().cluster() + .prepareSnapshotStatus("test-repo") + .setSnapshots("test-snap1") + .get(); + assertEquals(1, snapshotsStatusResponse.getSnapshots().size()); + assertEquals(State.FAILED, snapshotsStatusResponse.getSnapshots().get(0).getState()); + + logger.info("--> snapshot both good and bad index and get status"); + client.admin().cluster() + .prepareCreateSnapshot("test-repo", "test-snap2") + .setWaitForCompletion(true) + .setIndices("test-idx-good", "test-idx-bad") + .get(); + snapshotsStatusResponse = client.admin().cluster() + .prepareSnapshotStatus("test-repo") + .setSnapshots("test-snap2") + .get(); + assertEquals(1, snapshotsStatusResponse.getSnapshots().size()); + // verify a FAILED status is returned instead of a 500 status code + // see https://github.com/elastic/elasticsearch/issues/23716 + SnapshotStatus snapshotStatus = snapshotsStatusResponse.getSnapshots().get(0); + assertEquals(State.FAILED, snapshotStatus.getState()); + for (SnapshotIndexShardStatus shardStatus : snapshotStatus.getShards()) { + assertEquals(SnapshotIndexShardStage.FAILURE, shardStatus.getStage()); + if (shardStatus.getIndex().equals("test-idx-good")) { + String msg = "snapshot not taken because partial snapshots are not " + + "configured for this snapshot and another shard caused the " + + "snapshot to fail"; + assertEquals(msg, shardStatus.getFailure()); + } else { + assertEquals("primary shard is not allocated", shardStatus.getFailure()); + } + } + } }