Skip to content

Commit

Permalink
[Recovery] don't start a gateway recovery if source node is not found
Browse files Browse the repository at this point in the history
Due to a change introduced in elastic#6825, we now start a local gateway recovery for replicas if the source node cannot be found. The recovery then fails because we never recover replicas from disk.
  • Loading branch information
bleskes committed Jul 15, 2014
1 parent 7de9d3d commit 1d62ab7
Showing 1 changed file with 13 additions and 7 deletions.
Expand Up @@ -641,10 +641,14 @@ private void applyInitializingShard(final RoutingTable routingTable, final Disco
}
}

// figure out where to recover from (node or disk, in which case sourceNode is null)
// if we're in peer recovery, try to find out the source node now so in case it fails, we will not create the index shard
DiscoveryNode sourceNode = null;
if (isPeerRecovery(shardRouting)) {
sourceNode = findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting);
if (sourceNode == null) {
logger.trace("ignoring initializing shard {} - no source node can be found.", shardRouting.shardId());
return;
}
}

// if there is no shard, create it
Expand Down Expand Up @@ -692,21 +696,23 @@ private void applyInitializingShard(final RoutingTable routingTable, final Disco
return;
}

if (sourceNode != null) {
if (isPeerRecovery(shardRouting)) {
try {

assert sourceNode != null : "peer recovery started but sourceNode is null";

// we don't mark this one as relocated at the end.
// For primaries: requests are in any case routed to both copies while it's relocating, and that way we handle
// the edge case where it's marked as relocated, and we might need to roll it back...
// For replicas: we are recovering a backup from a primary

RecoveryState.Type type = shardRouting.primary() ? RecoveryState.Type.RELOCATION : RecoveryState.Type.REPLICA;
final Store store = indexShard.store();
final StartRecoveryRequest request;
store.incRef();
try {
store.failIfCorrupted();
request = new StartRecoveryRequest(indexShard.shardId(), sourceNode, nodes.localNode(),
false, store.getMetadata().asMap(), type, recoveryIdGenerator.incrementAndGet());
false, store.getMetadata().asMap(), type, recoveryIdGenerator.incrementAndGet());
} finally {
store.decRef();
}
Expand Down Expand Up @@ -753,20 +759,20 @@ private DiscoveryNode findSourceNodeForPeerRecovery(RoutingTable routingTable, D
// only recover from started primary, if we can't find one, we will do it next round
sourceNode = nodes.get(entry.currentNodeId());
if (sourceNode == null) {
logger.trace("can't find replica source node because primary shard {} is assigned to an unknown node. ignoring.", entry);
logger.trace("can't find replica source node because primary shard {} is assigned to an unknown node.", entry);
return null;
}
break;
}
}

if (sourceNode == null) {
logger.trace("can't find replica source node for {} because a primary shard can not be found. ignoring.", shardRouting.shardId());
logger.trace("can't find replica source node for {} because a primary shard can not be found.", shardRouting.shardId());
}
} else if (shardRouting.relocatingNodeId() != null) {
sourceNode = nodes.get(shardRouting.relocatingNodeId());
if (sourceNode == null) {
logger.trace("can't find relocation source node for shard {} because it is assigned to an unknown node [{}]. ignoring.", shardRouting.shardId(), shardRouting.relocatingNodeId());
logger.trace("can't find relocation source node for shard {} because it is assigned to an unknown node [{}].", shardRouting.shardId(), shardRouting.relocatingNodeId());
}
} else {
throw new ElasticsearchIllegalStateException("trying to find source node for peer recovery when routing state means no peer recovery: " + shardRouting);
Expand Down

0 comments on commit 1d62ab7

Please sign in to comment.