diff --git a/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 2a615649fcffe..fbc9af1660d8f 100644 --- a/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -60,6 +60,7 @@ import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.index.Index; import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.index.snapshots.IndexShardRestoreFailedException; import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus; import org.elasticsearch.repositories.IndexId; import org.elasticsearch.repositories.RepositoriesService; @@ -70,6 +71,7 @@ import org.elasticsearch.threadpool.ThreadPool; import java.io.IOException; +import java.nio.file.NoSuchFileException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -550,7 +552,8 @@ public List currentSnapshots(final String repository, /** * Returns status of shards currently finished snapshots *

- * This method is executed on master node and it's complimentary to the {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it + * This method is executed on master node and it's complimentary to the + * {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it * returns similar information but for already finished snapshots. *

* @@ -564,6 +567,8 @@ public Map snapshotShards(final String reposi Repository repository = repositoriesService.repository(repositoryName); RepositoryData repositoryData = repository.getRepositoryData(); MetaData metaData = repository.getSnapshotMetaData(snapshotInfo, repositoryData.resolveIndices(snapshotInfo.indices())); + Map nonFailedShards = new HashMap<>(); + boolean hasFailedShards = false; for (String index : snapshotInfo.indices()) { IndexId indexId = repositoryData.resolveIndexId(index); IndexMetaData indexMetaData = metaData.indices().get(index); @@ -577,14 +582,42 @@ public Map snapshotShards(final String reposi shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE); shardSnapshotStatus.failure(shardFailure.reason()); shardStatus.put(shardId, shardSnapshotStatus); + hasFailedShards = true; } else { - IndexShardSnapshotStatus shardSnapshotStatus = - repository.getShardSnapshotStatus(snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId); - shardStatus.put(shardId, shardSnapshotStatus); + nonFailedShards.put(shardId, indexId); } } } } + for (Map.Entry shard : nonFailedShards.entrySet()) { + final ShardId shardId = shard.getKey(); + final IndexId indexId = shard.getValue(); + IndexShardSnapshotStatus shardSnapshotStatus; + try { + shardSnapshotStatus = repository.getShardSnapshotStatus( + snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId); + } catch (IndexShardRestoreFailedException e) { + if (e.getCause() != null + && e.getCause() instanceof NoSuchFileException + && hasFailedShards) { + // the snap-*.dat file could not be found for the index, even + // though there was no shard failure for the index, this can + // happen when a snapshot is *not* set to partial (won't take + // partial snapshots) and has indices that contain missing + // or corrupted shards, in which case the snapshot will not proceed + // for any of the indices, even the ones that have no shard failures + shardSnapshotStatus = new IndexShardSnapshotStatus(); + shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE); + String msg = "snapshot not taken because partial snapshots are not " + + "configured for this snapshot and another shard caused the " + + "snapshot to fail"; + shardSnapshotStatus.failure(msg); + } else { + throw e; + } + } + shardStatus.put(shardId, shardSnapshotStatus); + } return unmodifiableMap(shardStatus); } diff --git a/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java b/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java index 77a0514a140ba..7c4a261e13177 100644 --- a/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java +++ b/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java @@ -54,6 +54,7 @@ import org.elasticsearch.cluster.metadata.MappingMetaData; import org.elasticsearch.cluster.metadata.MetaDataIndexStateService; import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; @@ -2737,4 +2738,77 @@ public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception { assertEquals(SnapshotState.SUCCESS, getSnapshotsResponse.getSnapshots().get(0).state()); } + public void testSnapshotStatusOnFailedIndex() throws Exception { + logger.info("--> creating repository"); + final Path repoPath = randomRepoPath(); + final Client client = client(); + assertAcked(client.admin().cluster() + .preparePutRepository("test-repo") + .setType("fs") + .setVerify(false) + .setSettings(Settings.builder().put("location", repoPath))); + + logger.info("--> creating good index"); + assertAcked(prepareCreate("test-idx-good") + .setSettings(Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 0))); + ensureGreen(); + final int numDocs = randomIntBetween(1, 5); + for (int i = 0; i < numDocs; i++) { + index("test-idx-good", "doc", Integer.toString(i), "foo", "bar" + i); + } + refresh(); + + logger.info("--> creating bad index"); + assertAcked(prepareCreate("test-idx-bad") + .setWaitForActiveShards(ActiveShardCount.NONE) + .setSettings(Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 0) + // set shard allocation to none so the primary cannot be + // allocated - simulates a "bad" index that fails to snapshot + .put(EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), + "none"))); + + logger.info("--> snapshot bad index and get status"); + client.admin().cluster() + .prepareCreateSnapshot("test-repo", "test-snap1") + .setWaitForCompletion(true) + .setIndices("test-idx-bad") + .get(); + SnapshotsStatusResponse snapshotsStatusResponse = client.admin().cluster() + .prepareSnapshotStatus("test-repo") + .setSnapshots("test-snap1") + .get(); + assertEquals(1, snapshotsStatusResponse.getSnapshots().size()); + assertEquals(State.FAILED, snapshotsStatusResponse.getSnapshots().get(0).getState()); + + logger.info("--> snapshot both good and bad index and get status"); + client.admin().cluster() + .prepareCreateSnapshot("test-repo", "test-snap2") + .setWaitForCompletion(true) + .setIndices("test-idx-good", "test-idx-bad") + .get(); + snapshotsStatusResponse = client.admin().cluster() + .prepareSnapshotStatus("test-repo") + .setSnapshots("test-snap2") + .get(); + assertEquals(1, snapshotsStatusResponse.getSnapshots().size()); + // verify a FAILED status is returned instead of a 500 status code + // see https://github.com/elastic/elasticsearch/issues/23716 + SnapshotStatus snapshotStatus = snapshotsStatusResponse.getSnapshots().get(0); + assertEquals(State.FAILED, snapshotStatus.getState()); + for (SnapshotIndexShardStatus shardStatus : snapshotStatus.getShards()) { + assertEquals(SnapshotIndexShardStage.FAILURE, shardStatus.getStage()); + if (shardStatus.getIndex().equals("test-idx-good")) { + String msg = "snapshot not taken because partial snapshots are not " + + "configured for this snapshot and another shard caused the " + + "snapshot to fail"; + assertEquals(msg, shardStatus.getFailure()); + } else { + assertEquals("primary shard is not allocated", shardStatus.getFailure()); + } + } + } }