From 3823c7b9f4e67d9a27da765a06411c533030ba80 Mon Sep 17 00:00:00 2001 From: BukrosSzabolcs Date: Wed, 24 Jun 2020 19:38:36 +0200 Subject: [PATCH 1/3] HBASE-24562: Stabilize master startup with meta replicas enabled (#1903) Signed-off-by: Wellington Chevreuil Signed-off-by: Huaxiang Sun --- .../apache/hadoop/hbase/master/HMaster.java | 6 +- .../hbase/master/MasterMetaBootstrap.java | 4 +- .../master/assignment/AssignmentManager.java | 34 +++++++- .../hbase/client/TestMetaWithReplicas.java | 77 +++++++++++++++++++ 4 files changed, 115 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index e87560c8c308..0dc29974710d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1131,7 +1131,11 @@ private void finishActiveMasterInitialization(MonitoredTask status) assignmentManager.checkIfShouldMoveSystemRegionAsync(); status.setStatus("Assign meta replicas"); MasterMetaBootstrap metaBootstrap = createMetaBootstrap(); - metaBootstrap.assignMetaReplicas(); + try { + metaBootstrap.assignMetaReplicas(); + } catch (IOException | KeeperException e){ + LOG.error("Assigning meta replica failed: ", e); + } status.setStatus("Starting quota manager"); initQuotaManager(); if (QuotaUtil.isQuotaEnabled(conf)) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java index e57817e573ac..1cf6cf1be101 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java @@ -81,9 +81,9 @@ void assignMetaReplicas() // down hosting server which calls AM#stop. if (metaState != null && metaState.getServerName() != null) { // Try to retain old assignment. - assignmentManager.assign(hri, metaState.getServerName()); + assignmentManager.assignAsync(hri, metaState.getServerName()); } else { - assignmentManager.assign(hri); + assignmentManager.assignAsync(hri); } } unassignExcessMetaReplica(numReplicas); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 9a5796fa5b08..588c3052d18d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -589,9 +589,9 @@ private void preTransitCheck(RegionStateNode regionNode, RegionState.State[] exp } } - // TODO: Need an async version of this for hbck2. - public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { - // TODO: should we use getRegionStateNode? + private TransitRegionStateProcedure createAssignProcedure(RegionInfo regionInfo, ServerName sn) + throws IOException { + // TODO: should we use getRegionStateNode? RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo); TransitRegionStateProcedure proc; regionNode.lock(); @@ -602,6 +602,12 @@ public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { } finally { regionNode.unlock(); } + return proc; + } + + // TODO: Need an async version of this for hbck2. + public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { + TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn); ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc); return proc.getProcId(); } @@ -610,6 +616,28 @@ public long assign(RegionInfo regionInfo) throws IOException { return assign(regionInfo, null); } + /** + * Submits a procedure that assigns a region to a target server without waiting for it to finish + * @param regionInfo the region we would like to assign + * @param sn target server name + * @return + * @throws IOException + */ + public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { + TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn); + return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc); + } + + /** + * Submits a procedure that assigns a region without waiting for it to finish + * @param regionInfo the region we would like to assign + * @return + * @throws IOException + */ + public Future assignAsync(RegionInfo regionInfo) throws IOException { + return assignAsync(regionInfo, null); + } + public long unassign(RegionInfo regionInfo) throws IOException { RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo); if (regionNode == null) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java index ddf568817d12..e8808e7438b5 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java @@ -23,12 +23,14 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Abortable; @@ -41,9 +43,13 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableNotFoundException; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil; import org.apache.hadoop.hbase.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.master.assignment.RegionStateNode; +import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; @@ -52,7 +58,9 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.apache.zookeeper.KeeperException; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.ClassRule; import org.junit.Rule; @@ -350,4 +358,73 @@ public void testShutdownOfReplicaHolder() throws Exception { assertNotEquals(3, i); } } + + @Test + public void testFailedReplicaAssigment() throws InterruptedException, IOException { + //using our rigged master, to force a failed meta replica assignment + TEST_UTIL.getMiniHBaseCluster().getConfiguration().setClass(HConstants.MASTER_IMPL, + BrokenMetaReplicaMaster.class, HMaster.class); + TEST_UTIL.getMiniHBaseCluster().stopMaster(0).join(); + HMaster newMaster = TEST_UTIL.getMiniHBaseCluster().startMaster().getMaster(); + //waiting for master to come up + TEST_UTIL.waitFor(30000, () -> newMaster.isInitialized()); + TEST_UTIL.getMiniHBaseCluster().getConfiguration().unset(HConstants.MASTER_IMPL); + + AssignmentManager am = newMaster.getAssignmentManager(); + //showing one of the replicas got assigned + RegionInfo metaReplicaHri = RegionReplicaUtil.getRegionInfoForReplica( + RegionInfoBuilder.FIRST_META_REGIONINFO, 1); + TEST_UTIL.waitFor(30000, () -> + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri) != null && + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri).getRegionLocation() + != null); + RegionStateNode metaReplicaRegionNode = + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri); + Assert.assertNotNull(metaReplicaRegionNode.getRegionLocation()); + //showing one of the replicas failed to be assigned + RegionInfo metaReplicaHri2 = RegionReplicaUtil.getRegionInfoForReplica( + RegionInfoBuilder.FIRST_META_REGIONINFO, 2); + RegionStateNode metaReplicaRegionNode2 = + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri2); + Assert.assertNull(metaReplicaRegionNode2.getRegionLocation()); + + //showing master is active and running + Assert.assertFalse(newMaster.isStopping()); + Assert.assertFalse(newMaster.isStopped()); + Assert.assertTrue(newMaster.isActiveMaster()); + } + + public static class BrokenTransitRegionStateProcedure extends TransitRegionStateProcedure { + protected BrokenTransitRegionStateProcedure() { + //super(env, hri, assignCandidate, forceNewPlan, type); + super(null, null, null, false,TransitionType.ASSIGN); + } + } + + public static class BrokenMetaReplicaMaster extends HMaster{ + public BrokenMetaReplicaMaster(final Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + public AssignmentManager createAssignmentManager(MasterServices master) { + return new BrokenMasterMetaAssignmentManager(master); + } + } + + public static class BrokenMasterMetaAssignmentManager extends AssignmentManager{ + MasterServices master; + public BrokenMasterMetaAssignmentManager(final MasterServices master) { + super(master); + this.master = master; + } + + public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { + RegionStateNode regionNode = getRegionStates().getOrCreateRegionStateNode(regionInfo); + if (regionNode.getRegionInfo().getReplicaId() == 2) { + regionNode.setProcedure(new BrokenTransitRegionStateProcedure()); + } + return super.assignAsync(regionInfo, sn); + } + } } From 8837350c9235d4754de91b22f6f655b6de10e94d Mon Sep 17 00:00:00 2001 From: Bukros Szabolcs Date: Wed, 1 Jul 2020 11:22:16 +0200 Subject: [PATCH 2/3] HBASE-24562: Stabilize master startup with meta replicas enabled fix checkstyle --- .../hadoop/hbase/master/assignment/AssignmentManager.java | 4 ++-- .../org/apache/hadoop/hbase/client/TestMetaWithReplicas.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 588c3052d18d..d5b3c23a23d7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -620,7 +620,7 @@ public long assign(RegionInfo regionInfo) throws IOException { * Submits a procedure that assigns a region to a target server without waiting for it to finish * @param regionInfo the region we would like to assign * @param sn target server name - * @return + * @return submitProcedure * @throws IOException */ public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { @@ -631,7 +631,7 @@ public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws I /** * Submits a procedure that assigns a region without waiting for it to finish * @param regionInfo the region we would like to assign - * @return + * @return submitProcedure * @throws IOException */ public Future assignAsync(RegionInfo regionInfo) throws IOException { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java index e8808e7438b5..cbfac2875456 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java @@ -47,9 +47,9 @@ import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil; -import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; +import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; From b9b270d8e52cdadefede301e3692774e8fc938a3 Mon Sep 17 00:00:00 2001 From: Bukros Szabolcs Date: Thu, 2 Jul 2020 09:56:47 +0200 Subject: [PATCH 3/3] HBASE-24562: Stabilize master startup with meta replicas enabled checkstyle fix --- .../hadoop/hbase/master/assignment/AssignmentManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index d5b3c23a23d7..8afe691e7c80 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -621,7 +621,7 @@ public long assign(RegionInfo regionInfo) throws IOException { * @param regionInfo the region we would like to assign * @param sn target server name * @return submitProcedure - * @throws IOException + * @throws IOException if preTransitCheck fails */ public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn); @@ -632,7 +632,7 @@ public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws I * Submits a procedure that assigns a region without waiting for it to finish * @param regionInfo the region we would like to assign * @return submitProcedure - * @throws IOException + * @throws IOException if preTransitCheck fails */ public Future assignAsync(RegionInfo regionInfo) throws IOException { return assignAsync(regionInfo, null);