From 1a48dc3310ed2319cf39a3a1f38aa06d5d87f427 Mon Sep 17 00:00:00 2001
From: Ali Beyad <ali@elastic.co>
Date: Thu, 30 Mar 2017 18:21:52 -0400
Subject: [PATCH] Fixes snapshot status on failed snapshots

If a snapshot is taken on multiple indices, and some of them are "good"
indices that don't contain any corruption or failures, and some of them
are "bad" indices that contain missing shards or corrupted shards, and
if the snapshot request is set to partial=false (meaning don't take a
snapshot if there are any failures), then the good indices will not be
snapshotted either.  Previously, when getting the status of such a
snapshot, a 500 error would be thrown, because the snap-*.dat blob for
the shards in the good index could not be found.

This commit fixes the problem by reporting shards of good indices as
failed due to a failed snapshot, instead of throwing the
NoSuchFileException.

Closes #23716
---
 .../snapshots/SnapshotsService.java           | 41 +++++++++-
 .../SharedClusterSnapshotRestoreIT.java       | 74 +++++++++++++++++++
 2 files changed, 111 insertions(+), 4 deletions(-)
diff --git a/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
index 2a615649fcffe..fbc9af1660d8f 100644
--- a/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
+++ b/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@@ -60,6 +60,7 @@
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.index.snapshots.IndexShardRestoreFailedException;
 import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
 import org.elasticsearch.repositories.IndexId;
 import org.elasticsearch.repositories.RepositoriesService;
@@ -70,6 +71,7 @@
 import org.elasticsearch.threadpool.ThreadPool;
 
 import java.io.IOException;
+import java.nio.file.NoSuchFileException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -550,7 +552,8 @@ public List<SnapshotsInProgress.Entry> currentSnapshots(final String repository,
     /**
      * Returns status of shards  currently finished snapshots
      * <p>
-     * This method is executed on master node and it's complimentary to the {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it
+     * This method is executed on master node and it's complimentary to the
+     * {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it
      * returns similar information but for already finished snapshots.
      * </p>
      *
@@ -564,6 +567,8 @@ public Map<ShardId, IndexShardSnapshotStatus> snapshotShards(final String reposi
         Repository repository = repositoriesService.repository(repositoryName);
         RepositoryData repositoryData = repository.getRepositoryData();
         MetaData metaData = repository.getSnapshotMetaData(snapshotInfo, repositoryData.resolveIndices(snapshotInfo.indices()));
+        Map<ShardId, IndexId> nonFailedShards = new HashMap<>();
+        boolean hasFailedShards = false;
         for (String index : snapshotInfo.indices()) {
             IndexId indexId = repositoryData.resolveIndexId(index);
             IndexMetaData indexMetaData = metaData.indices().get(index);
@@ -577,14 +582,42 @@ public Map<ShardId, IndexShardSnapshotStatus> snapshotShards(final String reposi
                         shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE);
                         shardSnapshotStatus.failure(shardFailure.reason());
                         shardStatus.put(shardId, shardSnapshotStatus);
+                        hasFailedShards = true;
                     } else {
-                        IndexShardSnapshotStatus shardSnapshotStatus =
-                            repository.getShardSnapshotStatus(snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId);
-                        shardStatus.put(shardId, shardSnapshotStatus);
+                        nonFailedShards.put(shardId, indexId);
                     }
                 }
             }
         }
+        for (Map.Entry<ShardId, IndexId> shard : nonFailedShards.entrySet()) {
+            final ShardId shardId = shard.getKey();
+            final IndexId indexId = shard.getValue();
+            IndexShardSnapshotStatus shardSnapshotStatus;
+            try {
+                shardSnapshotStatus = repository.getShardSnapshotStatus(
+                    snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId);
+            } catch (IndexShardRestoreFailedException e) {
+                if (e.getCause() != null
+                        && e.getCause() instanceof NoSuchFileException
+                        && hasFailedShards) {
+                    // the snap-*.dat file could not be found for the index, even
+                    // though there was no shard failure for the index, this can
+                    // happen when a snapshot is *not* set to partial (won't take
+                    // partial snapshots) and has indices that contain missing
+                    // or corrupted shards, in which case the snapshot will not proceed
+                    // for any of the indices, even the ones that have no shard failures
+                    shardSnapshotStatus = new IndexShardSnapshotStatus();
+                    shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE);
+                    String msg = "snapshot not taken because partial snapshots are not " +
+                                 "configured for this snapshot and another shard caused the " +
+                                 "snapshot to fail";
+                    shardSnapshotStatus.failure(msg);
+                } else {
+                    throw e;
+                }
+            }
+            shardStatus.put(shardId, shardSnapshotStatus);
+        }
         return unmodifiableMap(shardStatus);
     }
 
diff --git a/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java b/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java
index 77a0514a140ba..7c4a261e13177 100644
--- a/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java
+++ b/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java
@@ -54,6 +54,7 @@
 import org.elasticsearch.cluster.metadata.MappingMetaData;
 import org.elasticsearch.cluster.metadata.MetaDataIndexStateService;
 import org.elasticsearch.cluster.routing.IndexRoutingTable;
+import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.Strings;
@@ -2737,4 +2738,77 @@ public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception {
         assertEquals(SnapshotState.SUCCESS, getSnapshotsResponse.getSnapshots().get(0).state());
     }
 
+    public void testSnapshotStatusOnFailedIndex() throws Exception {
+        logger.info("--> creating repository");
+        final Path repoPath = randomRepoPath();
+        final Client client = client();
+        assertAcked(client.admin().cluster()
+            .preparePutRepository("test-repo")
+            .setType("fs")
+            .setVerify(false)
+            .setSettings(Settings.builder().put("location", repoPath)));
+
+        logger.info("--> creating good index");
+        assertAcked(prepareCreate("test-idx-good")
+            .setSettings(Settings.builder()
+                .put(SETTING_NUMBER_OF_SHARDS, 1)
+                .put(SETTING_NUMBER_OF_REPLICAS, 0)));
+        ensureGreen();
+        final int numDocs = randomIntBetween(1, 5);
+        for (int i = 0; i < numDocs; i++) {
+            index("test-idx-good", "doc", Integer.toString(i), "foo", "bar" + i);
+        }
+        refresh();
+
+        logger.info("--> creating bad index");
+        assertAcked(prepareCreate("test-idx-bad")
+            .setWaitForActiveShards(ActiveShardCount.NONE)
+            .setSettings(Settings.builder()
+                .put(SETTING_NUMBER_OF_SHARDS, 1)
+                .put(SETTING_NUMBER_OF_REPLICAS, 0)
+                // set shard allocation to none so the primary cannot be
+                // allocated - simulates a "bad" index that fails to snapshot
+                .put(EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(),
+                     "none")));
+
+        logger.info("--> snapshot bad index and get status");
+        client.admin().cluster()
+            .prepareCreateSnapshot("test-repo", "test-snap1")
+            .setWaitForCompletion(true)
+            .setIndices("test-idx-bad")
+            .get();
+        SnapshotsStatusResponse snapshotsStatusResponse = client.admin().cluster()
+            .prepareSnapshotStatus("test-repo")
+            .setSnapshots("test-snap1")
+            .get();
+        assertEquals(1, snapshotsStatusResponse.getSnapshots().size());
+        assertEquals(State.FAILED, snapshotsStatusResponse.getSnapshots().get(0).getState());
+
+        logger.info("--> snapshot both good and bad index and get status");
+        client.admin().cluster()
+            .prepareCreateSnapshot("test-repo", "test-snap2")
+            .setWaitForCompletion(true)
+            .setIndices("test-idx-good", "test-idx-bad")
+            .get();
+        snapshotsStatusResponse = client.admin().cluster()
+            .prepareSnapshotStatus("test-repo")
+            .setSnapshots("test-snap2")
+            .get();
+        assertEquals(1, snapshotsStatusResponse.getSnapshots().size());
+        // verify a FAILED status is returned instead of a 500 status code
+        // see https://github.com/elastic/elasticsearch/issues/23716
+        SnapshotStatus snapshotStatus = snapshotsStatusResponse.getSnapshots().get(0);
+        assertEquals(State.FAILED, snapshotStatus.getState());
+        for (SnapshotIndexShardStatus shardStatus : snapshotStatus.getShards()) {
+            assertEquals(SnapshotIndexShardStage.FAILURE, shardStatus.getStage());
+            if (shardStatus.getIndex().equals("test-idx-good")) {
+                String msg = "snapshot not taken because partial snapshots are not " +
+                             "configured for this snapshot and another shard caused the " +
+                             "snapshot to fail";
+                assertEquals(msg, shardStatus.getFailure());
+            } else {
+                assertEquals("primary shard is not allocated", shardStatus.getFailure());
+            }
+        }
+    }
 }