Skip to content

Commit

Permalink
YARN-3385. Fixed a race-condition in ResourceManager's ZooKeeper base…
Browse files Browse the repository at this point in the history
…d state-store to avoid crashing on duplicate deletes. Contributed by Zhihai Xu.
  • Loading branch information
vinoduec committed May 7, 2015
1 parent 31b627b commit 4c7b9b6
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 13 deletions.
3 changes: 3 additions & 0 deletions hadoop-yarn-project/CHANGES.txt
Expand Up @@ -373,6 +373,9 @@ Release 2.7.1 - UNRELEASED
YARN-3301. Fixed the format issue of the new RM attempt web page.
(Xuan Gong via jianhe)

YARN-3385. Fixed a race-condition in ResourceManager's ZooKeeper based
state-store to avoid crashing on duplicate deletes. (Zhihai Xu via vinodkv)

Release 2.7.0 - 2015-04-20

INCOMPATIBLE CHANGES
Expand Down
Expand Up @@ -694,7 +694,7 @@ public synchronized void removeApplicationStateInternal(
LOG.debug("Removing info for app: " + appId + " at: " + appIdRemovePath
+ " and its attempts.");
}
doMultiWithRetries(opList);
doDeleteMultiWithRetries(opList);
}

@Override
Expand All @@ -703,13 +703,12 @@ protected synchronized void storeRMDelegationTokenState(
throws Exception {
ArrayList<Op> opList = new ArrayList<Op>();
addStoreOrUpdateOps(opList, rmDTIdentifier, renewDate, false);
doMultiWithRetries(opList);
doStoreMultiWithRetries(opList);
}

@Override
protected synchronized void removeRMDelegationTokenState(
RMDelegationTokenIdentifier rmDTIdentifier) throws Exception {
ArrayList<Op> opList = new ArrayList<Op>();
String nodeRemovePath =
getNodePath(delegationTokensRootPath, DELEGATION_TOKEN_PREFIX
+ rmDTIdentifier.getSequenceNumber());
Expand All @@ -718,11 +717,12 @@ protected synchronized void removeRMDelegationTokenState(
+ rmDTIdentifier.getSequenceNumber());
}
if (existsWithRetries(nodeRemovePath, false) != null) {
ArrayList<Op> opList = new ArrayList<Op>();
opList.add(Op.delete(nodeRemovePath, -1));
doDeleteMultiWithRetries(opList);
} else {
LOG.debug("Attempted to delete a non-existing znode " + nodeRemovePath);
}
doMultiWithRetries(opList);
}

@Override
Expand All @@ -741,7 +741,7 @@ protected synchronized void updateRMDelegationTokenState(
// in case znode exists
addStoreOrUpdateOps(opList, rmDTIdentifier, renewDate, true);
}
doMultiWithRetries(opList);
doStoreMultiWithRetries(opList);
}

private void addStoreOrUpdateOps(ArrayList<Op> opList,
Expand Down Expand Up @@ -810,7 +810,7 @@ protected synchronized void removeRMDTMasterKeyState(
LOG.debug("Removing RMDelegationKey_" + delegationKey.getKeyId());
}
if (existsWithRetries(nodeRemovePath, false) != null) {
doMultiWithRetries(Op.delete(nodeRemovePath, -1));
doDeleteMultiWithRetries(Op.delete(nodeRemovePath, -1));
} else {
LOG.debug("Attempted to delete a non-existing znode " + nodeRemovePath);
}
Expand Down Expand Up @@ -914,7 +914,7 @@ String getNodePath(String root, String nodeName) {
* Helper method that creates fencing node, executes the passed operations,
* and deletes the fencing node.
*/
private synchronized void doMultiWithRetries(
private synchronized void doStoreMultiWithRetries(
final List<Op> opList) throws Exception {
final List<Op> execOpList = new ArrayList<Op>(opList.size() + 2);
execOpList.add(createFencingNodePathOp);
Expand All @@ -933,8 +933,32 @@ public Void run() throws KeeperException, InterruptedException {
* Helper method that creates fencing node, executes the passed operation,
* and deletes the fencing node.
*/
private void doMultiWithRetries(final Op op) throws Exception {
doMultiWithRetries(Collections.singletonList(op));
private void doStoreMultiWithRetries(final Op op) throws Exception {
doStoreMultiWithRetries(Collections.singletonList(op));
}

/**
 * Executes the given delete operations in a single {@code zkClient.multi}
 * call, placing the create-fencing-node op before them and the
 * delete-fencing-node op after them, and runs the call through
 * {@code runWithRetries()}.
 *
 * The action marks itself as containing delete ops before issuing the
 * multi, so that the retry logic can treat a NONODE failure as
 * "already deleted" rather than as an error.
 *
 * @param opList delete operations to run between the two fencing ops
 * @throws Exception anything propagated by {@code runWithRetries()}
 */
private synchronized void doDeleteMultiWithRetries(
    final List<Op> opList) throws Exception {
  // +2 leaves room for the fencing create/delete ops that bracket the list.
  final List<Op> fencedOps = new ArrayList<Op>(opList.size() + 2);
  fencedOps.add(createFencingNodePathOp);
  fencedOps.addAll(opList);
  fencedOps.add(deleteFencingNodePathOp);

  final ZKAction<Void> deleteAction = new ZKAction<Void>() {
    @Override
    public Void run() throws KeeperException, InterruptedException {
      // Flag first: the multi below may throw, and the flag must already
      // be set by the time the exception is inspected.
      setHasDeleteNodeOp(true);
      zkClient.multi(fencedOps);
      return null;
    }
  };
  deleteAction.runWithRetries();
}

/**
 * Single-operation convenience overload: wraps {@code op} in a one-element
 * list and hands it to {@link #doDeleteMultiWithRetries(List)}.
 */
private void doDeleteMultiWithRetries(final Op op) throws Exception {
  final List<Op> singleOp = Collections.singletonList(op);
  doDeleteMultiWithRetries(singleOp);
}

@VisibleForTesting
Expand All @@ -943,15 +967,15 @@ private void doMultiWithRetries(final Op op) throws Exception {
public void createWithRetries(
final String path, final byte[] data, final List<ACL> acl,
final CreateMode mode) throws Exception {
doMultiWithRetries(Op.create(path, data, acl, mode));
doStoreMultiWithRetries(Op.create(path, data, acl, mode));
}

@VisibleForTesting
@Private
@Unstable
public void setDataWithRetries(final String path, final byte[] data,
final int version) throws Exception {
doMultiWithRetries(Op.setData(path, data, version));
doStoreMultiWithRetries(Op.setData(path, data, version));
}

@VisibleForTesting
Expand Down Expand Up @@ -1017,7 +1041,12 @@ private void recursiveDeleteWithRetriesHelper(String path, boolean watch)
for (String child : children) {
recursiveDeleteWithRetriesHelper(path + "/" + child, false);
}
zkClient.delete(path, -1);

try {
zkClient.delete(path, -1);
} catch (KeeperException.NoNodeException nne) {
LOG.info("Node " + path + " doesn't exist to delete");
}
}

/**
Expand All @@ -1037,7 +1066,7 @@ public void run() {
if(isFencedState()) {
break;
}
doMultiWithRetries(emptyOpList);
doStoreMultiWithRetries(emptyOpList);
Thread.sleep(zkSessionTimeout);
}
} catch (InterruptedException ie) {
Expand All @@ -1050,6 +1079,10 @@ public void run() {
}

private abstract class ZKAction<T> {
private boolean hasDeleteNodeOp = false;
void setHasDeleteNodeOp(boolean hasDeleteOp) {
this.hasDeleteNodeOp = hasDeleteOp;
}
// run() expects synchronization on ZKRMStateStore.this
abstract T run() throws KeeperException, InterruptedException;

Expand Down Expand Up @@ -1099,6 +1132,11 @@ T runWithRetries() throws Exception {
LOG.info("znode already exists!");
return null;
}
if (hasDeleteNodeOp && ke.code() == Code.NONODE) {
LOG.info("znode has already been deleted!");
return null;
}

LOG.info("Exception while executing a ZK operation.", ke);
if (shouldRetry(ke.code()) && ++retry < numRetries) {
LOG.info("Retrying operation on ZK. Retry no. " + retry);
Expand Down
Expand Up @@ -25,6 +25,7 @@
import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;

import javax.crypto.SecretKey;
Expand All @@ -38,6 +39,7 @@
import org.apache.hadoop.security.token.delegation.DelegationKey;
import org.apache.hadoop.service.Service;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
Expand All @@ -58,6 +60,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;
import org.junit.Assert;
Expand Down Expand Up @@ -381,4 +384,36 @@ public void testFencedState() throws Exception {

store.close();
}

/**
 * Stores one application with a single attempt, then removes that
 * application from the state store twice. The second removal targets
 * znodes that are already gone and must not surface a
 * {@link KeeperException.NoNodeException}.
 */
@Test
public void testDuplicateRMAppDeletion() throws Exception {
  TestZKRMStateStoreTester zkTester = new TestZKRMStateStoreTester();
  long submitTime = System.currentTimeMillis();
  long startTime = System.currentTimeMillis() + 1234;
  RMStateStore store = zkTester.getRMStateStore();
  // Close the store on every exit path so a failing removal does not
  // leave it open for the rest of the test run.
  try {
    TestDispatcher dispatcher = new TestDispatcher();
    store.setRMDispatcher(dispatcher);

    ApplicationAttemptId attemptIdRemoved = ConverterUtils
        .toApplicationAttemptId("appattempt_1352994193343_0002_000001");
    ApplicationId appIdRemoved = attemptIdRemoved.getApplicationId();
    storeApp(store, appIdRemoved, submitTime, startTime);
    storeAttempt(store, attemptIdRemoved,
        "container_1352994193343_0002_01_000001", null, null, dispatcher);

    ApplicationSubmissionContext context =
        new ApplicationSubmissionContextPBImpl();
    context.setApplicationId(appIdRemoved);
    ApplicationStateData appStateRemoved =
        ApplicationStateData.newInstance(
            submitTime, startTime, context, "user1");
    appStateRemoved.attempts.put(attemptIdRemoved, null);

    // First removal deletes the app and attempt znodes.
    store.removeApplicationStateInternal(appStateRemoved);
    // Second removal is the duplicate delete under test.
    try {
      store.removeApplicationStateInternal(appStateRemoved);
    } catch (KeeperException.NoNodeException nne) {
      Assert.fail("NoNodeException should not happen.");
    }
  } finally {
    store.close();
  }
}
}

0 comments on commit 4c7b9b6

Please sign in to comment.