From 27d958dea72ee3aa593a695de29f8bbb18d403eb Mon Sep 17 00:00:00 2001 From: Svetoslav Neykov Date: Wed, 15 Oct 2014 12:59:52 +0300 Subject: [PATCH] Fix an HA failover bug where the master tries to become master again and fails. Instead of failing, just log a warning; it turns out this case can happen in a real-world scenario where the storage delays the master's writes. --- .../entity/rebind/RebindManagerImpl.java | 6 +- .../ha/HighAvailabilityManagerImpl.java | 2 + ...HighAvailabilityManagerSplitBrainTest.java | 87 +++++++++++++++++++ .../ha/TestEntityFailingRebind.java | 44 ++++++++++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 core/src/test/java/brooklyn/management/ha/TestEntityFailingRebind.java diff --git a/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java b/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java index 745b7cb991..35be72991a 100644 --- a/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java +++ b/core/src/main/java/brooklyn/entity/rebind/RebindManagerImpl.java @@ -916,8 +916,10 @@ else if (wasReadOnly) return ManagementTransitionMode.REBINDING_BECOMING_PRIMARY; else if (isNowReadOnly) return ManagementTransitionMode.REBINDING_NO_LONGER_PRIMARY; - else - throw new IllegalStateException("Rebinding master not supported: "+item); + else { + LOG.warn("Transitioning to master, though never stopped being a master - " + item); + return ManagementTransitionMode.REBINDING_BECOMING_PRIMARY; + } } } diff --git a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java index c71d352cf2..fb339d4a47 100644 --- a/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java +++ b/core/src/main/java/brooklyn/management/ha/HighAvailabilityManagerImpl.java @@ -581,6 +581,8 @@ protected void checkMaster(boolean initializing) { LOG.debug("Master-change for this node only, demoting 
"+ownNodeRecord.toVerboseString()+" in favour of official master "+newMasterNodeRecord.toVerboseString()); demoteToStandby(BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY)); return; + } else { + LOG.debug("Detected master heartbeat timeout. Initiating a new master election. Master was " + currMasterNodeRecord); } // Need to choose a new master diff --git a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java index 48bdfa578d..0ca2c7f117 100644 --- a/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java +++ b/core/src/test/java/brooklyn/management/ha/HighAvailabilityManagerSplitBrainTest.java @@ -19,6 +19,8 @@ package brooklyn.management.ha; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.fail; import java.util.Collections; import java.util.Date; @@ -43,8 +45,10 @@ import brooklyn.entity.rebind.persister.PersistenceObjectStore; import brooklyn.internal.BrooklynFeatureEnablement; import brooklyn.location.Location; +import brooklyn.management.ha.TestEntityFailingRebind.RebindException; import brooklyn.management.internal.ManagementContextInternal; import brooklyn.test.Asserts; +import brooklyn.test.EntityTestUtils; import brooklyn.test.entity.LocalManagementContextForTests; import brooklyn.test.entity.TestApplication; import brooklyn.util.collections.MutableList; @@ -181,6 +185,97 @@ protected PersistenceObjectStore newPersistenceObjectStore() { return new InMemoryObjectStore(sharedBackingStore, sharedBackingStoreDates); } + /** After the shared ticker advances one minute, the check on each node must fail with a nested RebindException and both nodes must be recorded as FAILED. */ + @Test + public void testDoubleRebindFails() throws Exception { + useSharedTime(); + HaMgmtNode n1 = newNode(); + HaMgmtNode n2 = newNode(); + + // first auto should become master + n1.ha.start(HighAvailabilityMode.AUTO); + n2.ha.start(HighAvailabilityMode.AUTO); + + 
TestApplication app = ApplicationBuilder.newManagedApp( + EntitySpec.create(TestApplication.class).impl(TestEntityFailingRebind.class), n1.mgmt); + app.start(ImmutableList.of()); + + n1.mgmt.getRebindManager().forcePersistNow(); + + //don't publish state for a while (i.e. long store delays, failures) + sharedTickerAdvance(Duration.ONE_MINUTE); + + try { + n2.ha.publishAndCheck(false); + fail("n2 rebind failure expected"); + } catch (Exception e) { + assertNestedRebindException(e); + } + + try { + n1.ha.publishAndCheck(false); + fail("n1 rebind failure expected"); + } catch (Exception e) { + assertNestedRebindException(e); + } + + ManagementPlaneSyncRecord memento = n1.ha.getManagementPlaneSyncState(); + assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.FAILED); + assertEquals(memento.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.FAILED); + } + + /** After the shared ticker advances one minute, the check on n2 must fail with a nested RebindException; with rebind failures switched off, the check on n1 must leave n1 recorded as MASTER and n2 as FAILED. */ + @Test + public void testStandbyRebind() throws Exception { + useSharedTime(); + HaMgmtNode n1 = newNode(); + HaMgmtNode n2 = newNode(); + + // first auto should become master + n1.ha.start(HighAvailabilityMode.AUTO); + n2.ha.start(HighAvailabilityMode.AUTO); + + TestApplication app = ApplicationBuilder.newManagedApp( + EntitySpec.create(TestApplication.class).impl(TestEntityFailingRebind.class), n1.mgmt); + app.start(ImmutableList.of()); + + n1.mgmt.getRebindManager().forcePersistNow(); + + //don't publish state for a while (i.e. 
long store delays, failures) + sharedTickerAdvance(Duration.ONE_MINUTE); + + try { + n2.ha.publishAndCheck(false); + fail("n2 rebind failure expected"); + } catch (Exception e) { + assertNestedRebindException(e); + } + + try { + TestEntityFailingRebind.setThrowOnRebind(false); + n1.ha.publishAndCheck(false); + + ManagementPlaneSyncRecord memento = n1.ha.getManagementPlaneSyncState(); + assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.MASTER); + assertEquals(memento.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.FAILED); + } finally { + /* The flag is static: switch rebind failures back on so the change cannot leak into other tests. */ + TestEntityFailingRebind.setThrowOnRebind(true); + } + } + + /** Returns normally if t or any of its causes is a RebindException; otherwise hands t to Exceptions.propagate. */ + private void assertNestedRebindException(Throwable t) { + Throwable ptr = t; + while (ptr != null) { + if (ptr instanceof RebindException) { + return; + } + ptr = ptr.getCause(); + } + Exceptions.propagate(t); + } + @Test public void testIfNodeStopsBeingAbleToWrite() throws Exception { useSharedTime(); diff --git a/core/src/test/java/brooklyn/management/ha/TestEntityFailingRebind.java b/core/src/test/java/brooklyn/management/ha/TestEntityFailingRebind.java new file mode 100644 index 0000000000..074cddec53 --- /dev/null +++ b/core/src/test/java/brooklyn/management/ha/TestEntityFailingRebind.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package brooklyn.management.ha; + +import brooklyn.test.entity.TestApplicationImpl; + +/** Test application whose {@link #rebind()} throws while a static flag is set; the flag starts out set. */ +public class TestEntityFailingRebind extends TestApplicationImpl { + public static class RebindException extends RuntimeException { /* unchecked exception type thrown by rebind() below */ + private static final long serialVersionUID = 1L; + + public RebindException(String message) { + super(message); + } + } + + private static boolean throwOnRebind = true; /* static: one setting shared by every instance; nothing in this class resets it */ + public static void setThrowOnRebind(boolean state) { + throwOnRebind = state; + } + + @Override + public void rebind() { + if (throwOnRebind) { /* fail on demand; when the flag is off this override does nothing */ + throw new RebindException("Intentional exception thrown when rebinding " + this); + } /* NOTE(review): super.rebind() is never called here - confirm the superclass hook can be skipped */ + } +}