From 38295e30e32bccb54b42015734873af40cdab8b6 Mon Sep 17 00:00:00 2001 From: Tomas Hofman Date: Mon, 3 Sep 2018 15:47:03 +0200 Subject: [PATCH] ARTEMIS-2069 Backup doesn't activate after shared store is reconnected --- ...tiveMQLockAcquisitionTimeoutException.java | 29 ++++ .../core/server/impl/FileLockNodeManager.java | 69 ++++---- .../byteman/FileLockNodeManagerTest.java | 86 ++++++++++ .../SharedStoreBackupActivationTest.java | 150 ++++++++++++++++++ .../cluster/failover/FailoverTestBase.java | 4 + 5 files changed, 309 insertions(+), 29 deletions(-) create mode 100644 artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java create mode 100644 tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java create mode 100644 tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java new file mode 100644 index 00000000000..af6d5c215e7 --- /dev/null +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/ActiveMQLockAcquisitionTimeoutException.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.core.server; + +import org.apache.activemq.artemis.api.core.ActiveMQException; + +public class ActiveMQLockAcquisitionTimeoutException extends ActiveMQException { + + public ActiveMQLockAcquisitionTimeoutException() { + } + + public ActiveMQLockAcquisitionTimeoutException(String msg) { + super(msg); + } +} diff --git a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java index 73156eee743..2044189c177 100644 --- a/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java +++ b/artemis-server/src/main/java/org/apache/activemq/artemis/core/server/impl/FileLockNodeManager.java @@ -24,6 +24,7 @@ import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException; import org.apache.activemq.artemis.api.core.SimpleString; import org.apache.activemq.artemis.core.server.ActivateCallback; +import org.apache.activemq.artemis.core.server.ActiveMQLockAcquisitionTimeoutException; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.utils.UUID; @@ -49,6 +50,8 @@ public class FileLockNodeManager extends NodeManager { private static final byte NOT_STARTED = 'N'; + private static final long LOCK_ACCESS_FAILURE_WAIT_TIME = 2000; + private FileLock liveLock; private FileLock backupLock; @@ -299,44 +302,52 @@ protected FileLock tryLock(final long lockPos) throws IOException { protected FileLock lock(final long lockPosition) throws Exception { long start = System.currentTimeMillis(); + boolean isRecurringFailure = false; while (!interrupted) { - FileLock lock = tryLock(lockPosition); - - if (lock == null) { - try { - Thread.sleep(500); - } catch (InterruptedException e) { - return null; + try { + FileLock lock = tryLock(lockPosition); + isRecurringFailure = false; + + if (lock == null) { + try { + Thread.sleep(500); + } catch (InterruptedException e) { + return null; + } + + if (lockAcquisitionTimeout != -1 && (System.currentTimeMillis() - start) > lockAcquisitionTimeout) { + throw new ActiveMQLockAcquisitionTimeoutException("timed out waiting for lock"); + } + } else { + return lock; } - - if (lockAcquisitionTimeout != -1 && (System.currentTimeMillis() - start) > lockAcquisitionTimeout) { - throw new Exception("timed out waiting for lock"); + } catch (IOException e) { + // IOException during trylock() may be a temporary issue, e.g. NFS volume not being accessible + + logger.log(isRecurringFailure ? Logger.Level.DEBUG : Logger.Level.WARN, + "Failure when accessing a lock file", e); + isRecurringFailure = true; + + long waitTime = LOCK_ACCESS_FAILURE_WAIT_TIME; + if (lockAcquisitionTimeout != -1) { + final long remainingTime = lockAcquisitionTimeout - (System.currentTimeMillis() - start); + if (remainingTime <= 0) { + throw new ActiveMQLockAcquisitionTimeoutException("timed out waiting for lock"); + } + waitTime = Math.min(waitTime, remainingTime); } - } else { - return lock; - } - } - // todo this is here because sometimes channel.lock throws a resource deadlock exception but trylock works, - // need to investigate further and review - FileLock lock; - do { - lock = tryLock(lockPosition); - if (lock == null) { try { - Thread.sleep(500); - } catch (InterruptedException e1) { - // + Thread.sleep(waitTime); + } catch (InterruptedException interrupt) { + return null; } } - if (interrupted) { - interrupted = false; - throw new IOException("Lock was interrupted"); - } } - while (lock == null); - return lock; + + // presumed interrupted + return null; } } diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java new file mode 100644 index 00000000000..4a740184286 --- /dev/null +++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/FileLockNodeManagerTest.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.extras.byteman; + +import java.io.File; +import java.io.IOException; + +import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager; +import org.jboss.byteman.contrib.bmunit.BMRule; +import org.jboss.byteman.contrib.bmunit.BMRules; +import org.jboss.byteman.contrib.bmunit.BMUnitRunner; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; + +@RunWith(BMUnitRunner.class) +public class FileLockNodeManagerTest { + + private static final int TIMEOUT_TOLERANCE = 50; + + private File sharedDir; + + public FileLockNodeManagerTest() throws IOException { + sharedDir = File.createTempFile("shared-dir", ""); + sharedDir.delete(); + Assert.assertTrue(sharedDir.mkdir()); + } + + @Test + @BMRules( + rules = {@BMRule( + name = "throw IOException during activation", + targetClass = "org.apache.activemq.artemis.core.server.impl.FileLockNodeManager", + targetMethod = "tryLock", + targetLocation = "AT ENTRY", + action = "THROW new IOException(\"IO Error\");") + }) + public void test() throws Exception { + measureLockAcquisisionTimeout(100); // warm-up + + assertMeasuredTimeoutFor(100); + assertMeasuredTimeoutFor(200); + } + + protected void assertMeasuredTimeoutFor(long lockAcquisitionTimeout) throws Exception { + long realTimeout = measureLockAcquisisionTimeout(lockAcquisitionTimeout); + System.out.println(String.format("lockAcquisisionTimeout: %d ms, measured timeout: %d ms", lockAcquisitionTimeout, realTimeout)); + Assert.assertTrue(String.format("Timeout %d ms was larger than expected %d ms", realTimeout, lockAcquisitionTimeout + TIMEOUT_TOLERANCE), + lockAcquisitionTimeout + TIMEOUT_TOLERANCE >= realTimeout); + Assert.assertTrue(String.format("Timeout %d ms was lower than expected %d ms", realTimeout, lockAcquisitionTimeout), + lockAcquisitionTimeout + TIMEOUT_TOLERANCE >= realTimeout); + } + + private long measureLockAcquisisionTimeout(long lockAcquisitionTimeout) throws Exception { + FileLockNodeManager manager = new FileLockNodeManager(sharedDir, false, lockAcquisitionTimeout); + manager.start(); + + // try to lock and measure real timeout + long start = System.currentTimeMillis(); + try { + manager.awaitLiveNode(); + } catch (Exception e) { + long stop = System.currentTimeMillis(); + if (!"timed out waiting for lock".equals(e.getMessage())) { + throw e; + } + return stop - start; + } + Assert.fail("Exception expected"); + return 0; + } +} diff --git a/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java new file mode 100644 index 00000000000..2df895d1f31 --- /dev/null +++ b/tests/extra-tests/src/test/java/org/apache/activemq/artemis/tests/extras/byteman/SharedStoreBackupActivationTest.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.activemq.artemis.tests.extras.byteman; + +import java.io.File; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import org.apache.activemq.artemis.api.core.TransportConfiguration; +import org.apache.activemq.artemis.core.config.ScaleDownConfiguration; +import org.apache.activemq.artemis.core.config.ha.SharedStoreMasterPolicyConfiguration; +import org.apache.activemq.artemis.core.config.ha.SharedStoreSlavePolicyConfiguration; +import org.apache.activemq.artemis.core.server.NodeManager; +import org.apache.activemq.artemis.core.server.impl.FileLockNodeManager; +import org.apache.activemq.artemis.tests.integration.cluster.failover.FailoverTestBase; +import org.apache.activemq.artemis.tests.util.TransportConfigurationUtils; +import org.jboss.byteman.contrib.bmunit.BMRule; +import org.jboss.byteman.contrib.bmunit.BMRules; +import org.jboss.byteman.contrib.bmunit.BMUnitRunner; +import org.jboss.logging.Logger; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; + +@RunWith(BMUnitRunner.class) +public class SharedStoreBackupActivationTest extends FailoverTestBase { + + private static Logger logger = Logger.getLogger(SharedStoreBackupActivationTest.class); + + private static volatile boolean throwException = false; + private static CountDownLatch exceptionThrownLatch; + + public static synchronized boolean isThrowException() { + logger.debugf("Throwing IOException during FileLockNodeManager.tryLock(): %s", throwException); + if (exceptionThrownLatch != null) { + exceptionThrownLatch.countDown(); + } + return throwException; + } + + /** + * Waits for the backup server to call FileLockNodeManager.tryLock(). + */ + public static void awaitTryLock(boolean throwException) throws InterruptedException { + synchronized (SharedStoreBackupActivationTest.class) { + SharedStoreBackupActivationTest.throwException = throwException; + exceptionThrownLatch = new CountDownLatch(1); + } + logger.debugf("Awaiting backup to perform FileLockNodeManager.tryLock()"); + boolean ret = exceptionThrownLatch.await(10, TimeUnit.SECONDS); + SharedStoreBackupActivationTest.throwException = false; + + Assert.assertTrue("FileLockNodeManager.tryLock() was not called during specified timeout", ret); + logger.debugf("Awaiting FileLockNodeManager.tryLock() done"); + } + + @Test + @BMRules( + rules = {@BMRule( + name = "throw IOException during activation", + targetClass = "org.apache.activemq.artemis.core.server.impl.FileLockNodeManager", + targetMethod = "tryLock", + targetLocation = "AT ENTRY", + condition = "org.apache.activemq.artemis.tests.extras.byteman.SharedStoreBackupActivationTest.isThrowException()", + action = "THROW new IOException(\"IO Error\");") + }) + public void testFailOverAfterTryLockException() throws Exception { + Assert.assertTrue(liveServer.isActive()); + Assert.assertFalse(backupServer.isActive()); + + // wait for backup to try to acquire lock, once without exception (acquiring will not succeed because live is + // still active) + awaitTryLock(false); + + // wait for backup to try to acquire lock, this time throw an IOException + logger.debug("Causing exception"); + awaitTryLock(true); + + // stop live server + logger.debugf("Stopping live server"); + liveServer.stop(); + waitForServerToStop(liveServer.getServer()); + logger.debugf("Live server stopped, waiting for backup activation"); + backupServer.getServer().waitForActivation(10, TimeUnit.SECONDS); + + // backup should be activated by now + Assert.assertFalse(liveServer.isActive()); + Assert.assertTrue("Backup server didn't activate", backupServer.isActive()); + } + + @Override + protected TransportConfiguration getAcceptorTransportConfiguration(final boolean live) { + return TransportConfigurationUtils.getInVMAcceptor(live); + } + + @Override + protected TransportConfiguration getConnectorTransportConfiguration(final boolean live) { + return TransportConfigurationUtils.getInVMConnector(live); + } + + @Override + protected void createConfigs() throws Exception { + File sharedDir = File.createTempFile("shared-dir", ""); + sharedDir.delete(); + Assert.assertTrue(sharedDir.mkdir()); + logger.debugf("Created shared store directory %s", sharedDir.getCanonicalPath()); + + TransportConfiguration liveConnector = getConnectorTransportConfiguration(true); + TransportConfiguration backupConnector = getConnectorTransportConfiguration(false); + + // nodes must use separate FileLockNodeManager instances! + NodeManager liveNodeManager = new FileLockNodeManager(sharedDir, false); + NodeManager backupNodeManager = new FileLockNodeManager(sharedDir, false); + + backupConfig = super.createDefaultConfig(false) + .clearAcceptorConfigurations() + .addAcceptorConfiguration(getAcceptorTransportConfiguration(false)) + .setHAPolicyConfiguration( + new SharedStoreSlavePolicyConfiguration() + .setScaleDownConfiguration(new ScaleDownConfiguration().setEnabled(false)) + .setRestartBackup(false)) + .addConnectorConfiguration(liveConnector.getName(), liveConnector) + .addConnectorConfiguration(backupConnector.getName(), backupConnector) + .addClusterConfiguration(basicClusterConnectionConfig(backupConnector.getName(), liveConnector.getName())); + backupServer = createTestableServer(backupConfig, backupNodeManager); + + liveConfig = super.createDefaultConfig(false) + .clearAcceptorConfigurations() + .addAcceptorConfiguration(getAcceptorTransportConfiguration(true)) + .setHAPolicyConfiguration(new SharedStoreMasterPolicyConfiguration().setFailoverOnServerShutdown(true)) + .addClusterConfiguration(basicClusterConnectionConfig(liveConnector.getName())) + .addConnectorConfiguration(liveConnector.getName(), liveConnector); + liveServer = createTestableServer(liveConfig, liveNodeManager); + } + +} diff --git a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java index 2a75f94eafd..e4d4711bd74 100644 --- a/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java +++ b/tests/integration-tests/src/test/java/org/apache/activemq/artemis/tests/integration/cluster/failover/FailoverTestBase.java @@ -113,6 +113,10 @@ protected void setLiveIdentity() { } protected TestableServer createTestableServer(Configuration config) throws Exception { + return createTestableServer(config, nodeManager); + } + + protected TestableServer createTestableServer(Configuration config, NodeManager nodeManager) throws Exception { boolean isBackup = config.getHAPolicyConfiguration() instanceof ReplicaPolicyConfiguration || config.getHAPolicyConfiguration() instanceof SharedStoreSlavePolicyConfiguration; return new SameProcessActiveMQServer(createInVMFailoverServer(true, config, nodeManager, isBackup ? 2 : 1)); }