Skip to content

Commit

Permalink
HBASE-24562: Stabilize master startup with meta replicas enabled (#1903)
Browse files Browse the repository at this point in the history
Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
Signed-off-by: Huaxiang Sun <huaxiangsun@apache.com>
(cherry picked from commit 8cdb2cc)
  • Loading branch information
BukrosSzabolcs authored and wchevreuil committed Jun 26, 2020
1 parent a6c8870 commit 50d1a79
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1140,7 +1140,11 @@ private void finishActiveMasterInitialization(MonitoredTask status)
assignmentManager.checkIfShouldMoveSystemRegionAsync();
status.setStatus("Assign meta replicas");
MasterMetaBootstrap metaBootstrap = createMetaBootstrap();
metaBootstrap.assignMetaReplicas();
try {
metaBootstrap.assignMetaReplicas();
} catch (IOException | KeeperException e){
LOG.error("Assigning meta replica failed: ", e);
}
status.setStatus("Starting quota manager");
initQuotaManager();
if (QuotaUtil.isQuotaEnabled(conf)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ void assignMetaReplicas()
// down hosting server which calls AM#stop.
if (metaState != null && metaState.getServerName() != null) {
// Try to retain old assignment.
assignmentManager.assign(hri, metaState.getServerName());
assignmentManager.assignAsync(hri, metaState.getServerName());
} else {
assignmentManager.assign(hri);
assignmentManager.assignAsync(hri);
}
}
unassignExcessMetaReplica(numReplicas);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -589,9 +589,9 @@ private void preTransitCheck(RegionStateNode regionNode, RegionState.State[] exp
}
}

// TODO: Need an async version of this for hbck2.
public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
// TODO: should we use getRegionStateNode?
private TransitRegionStateProcedure createAssignProcedure(RegionInfo regionInfo, ServerName sn)
throws IOException {
// TODO: should we use getRegionStateNode?
RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo);
TransitRegionStateProcedure proc;
regionNode.lock();
Expand All @@ -602,6 +602,12 @@ public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
} finally {
regionNode.unlock();
}
return proc;
}

// TODO: Need an async version of this for hbck2.
/**
 * Builds an assign procedure for the given region and passes it to
 * {@code ProcedureSyncWait.submitAndWaitProcedure} on the master procedure executor.
 * @param regionInfo the region we would like to assign
 * @param sn server name forwarded unchanged to {@code createAssignProcedure}
 * @return the id of the procedure that was submitted
 * @throws IOException propagated from {@code createAssignProcedure}; the submit-and-wait call
 *           may presumably raise it as well
 */
public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
  final TransitRegionStateProcedure assignProc = createAssignProcedure(regionInfo, sn);
  ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), assignProc);
  final long procId = assignProc.getProcId();
  return procId;
}
Expand All @@ -610,6 +616,28 @@ public long assign(RegionInfo regionInfo) throws IOException {
return assign(regionInfo, null);
}

/**
 * Submits a procedure that assigns a region to a target server without waiting for it to
 * finish.
 * @param regionInfo the region we would like to assign
 * @param sn target server name, forwarded unchanged to {@code createAssignProcedure}
 * @return the future handed back by {@code ProcedureSyncWait.submitProcedure} for the
 *         submitted procedure
 * @throws IOException propagated from {@code createAssignProcedure}; the submit call may
 *           presumably raise it as well
 */
public Future<byte[]> assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException {
  final TransitRegionStateProcedure assignProc = createAssignProcedure(regionInfo, sn);
  final Future<byte[]> pending =
    ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), assignProc);
  return pending;
}

/**
 * Submits a procedure that assigns a region without waiting for it to finish. Delegates to
 * the two-argument overload with a {@code null} server name.
 * @param regionInfo the region we would like to assign
 * @return the future returned by the two-argument {@code assignAsync}
 * @throws IOException propagated from the two-argument {@code assignAsync}
 */
public Future<byte[]> assignAsync(RegionInfo regionInfo) throws IOException {
  // No target server is supplied by this overload.
  final ServerName noTargetServer = null;
  return assignAsync(regionInfo, noTargetServer);
}

public long unassign(RegionInfo regionInfo) throws IOException {
RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
if (regionNode == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Abortable;
Expand All @@ -42,9 +44,12 @@
import org.apache.hadoop.hbase.StartMiniClusterOption;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
Expand All @@ -54,6 +59,7 @@
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Rule;
Expand All @@ -63,6 +69,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;

/**
* Tests the scenarios where replicas are enabled for the meta table
*/
Expand Down Expand Up @@ -163,15 +171,15 @@ public void testZookeeperNodesForReplicas() throws Exception {
conf.get("zookeeper.znode.metaserver", "meta-region-server"));
// check that the data in the znode is parseable (this would also mean the znode exists)
byte[] data = ZKUtil.getData(zkw, primaryMetaZnode);
ProtobufUtil.toServerName(data);
ProtobufUtil.parseServerNameFrom(data);
for (int i = 1; i < 3; i++) {
String secZnode = ZNodePaths.joinZNode(baseZNode,
conf.get("zookeeper.znode.metaserver", "meta-region-server") + "-" + i);
String str = zkw.getZNodePaths().getZNodeForReplica(i);
assertTrue(str.equals(secZnode));
// check that the data in the znode is parseable (this would also mean the znode exists)
data = ZKUtil.getData(zkw, secZnode);
ProtobufUtil.toServerName(data);
ProtobufUtil.parseServerNameFrom(data);
}
}

Expand All @@ -198,7 +206,7 @@ public static void shutdownMetaAndDoValidations(HBaseTestingUtility util) throws
String primaryMetaZnode = ZNodePaths.joinZNode(baseZNode,
conf.get("zookeeper.znode.metaserver", "meta-region-server"));
byte[] data = ZKUtil.getData(zkw, primaryMetaZnode);
ServerName primary = ProtobufUtil.toServerName(data);
ServerName primary = ProtobufUtil.parseServerNameFrom(data);
LOG.info("Primary=" + primary.toString());

TableName TABLE = TableName.valueOf("testShutdownHandling");
Expand Down Expand Up @@ -304,7 +312,7 @@ public void testMetaAddressChange() throws Exception {
conf.get("zookeeper.znode.metaserver", "meta-region-server"));
// check that the data in the znode is parseable (this would also mean the znode exists)
byte[] data = ZKUtil.getData(zkw, primaryMetaZnode);
ServerName currentServer = ProtobufUtil.toServerName(data);
ServerName currentServer = ProtobufUtil.parseServerNameFrom(data);
Collection<ServerName> liveServers = TEST_UTIL.getAdmin()
.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)).getLiveServerMetrics().keySet();
ServerName moveToServer = null;
Expand All @@ -326,7 +334,7 @@ public void testMetaAddressChange() throws Exception {
do {
Thread.sleep(10);
data = ZKUtil.getData(zkw, primaryMetaZnode);
currentServer = ProtobufUtil.toServerName(data);
currentServer = ProtobufUtil.parseServerNameFrom(data);
i++;
} while (!moveToServer.equals(currentServer) && i < max); //wait for 10 seconds overall
assertNotEquals(max, i);
Expand All @@ -353,4 +361,67 @@ public void testShutdownOfReplicaHolder() throws Exception {
assertNotEquals(3, i);
}
}

/**
 * Restarts the master using {@link BrokenMetaReplicaMaster} and checks that the master still
 * finishes initialization and stays active: meta replica 1 has a region location, meta
 * replica 2 (the one the rigged assignment manager tampers with) has none.
 * <p>
 * The {@code MASTER_IMPL} override is removed in a {@code finally} block so that a failure
 * while stopping, starting or waiting for the master does not leave the rigged master class in
 * the cluster configuration.
 */
@Test
public void testFailedReplicaAssigment() throws InterruptedException, IOException {
  // using our rigged master, to force a failed meta replica assignment
  TEST_UTIL.getMiniHBaseCluster().getConfiguration().setClass(HConstants.MASTER_IMPL,
    BrokenMetaReplicaMaster.class, HMaster.class);
  final HMaster newMaster;
  try {
    TEST_UTIL.getMiniHBaseCluster().stopMaster(0).join();
    newMaster = TEST_UTIL.getMiniHBaseCluster().startMaster().getMaster();
    // waiting for master to come up
    TEST_UTIL.waitFor(30000, () -> newMaster.isInitialized());
  } finally {
    // restore the configuration on success and on failure alike
    TEST_UTIL.getMiniHBaseCluster().getConfiguration().unset(HConstants.MASTER_IMPL);
  }

  AssignmentManager am = newMaster.getAssignmentManager();
  // showing one of the replicas got assigned
  RegionInfo metaReplicaHri =
    RegionReplicaUtil.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, 1);
  RegionStateNode metaReplicaRegionNode =
    am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri);
  Assert.assertNotNull(metaReplicaRegionNode.getRegionLocation());
  // showing one of the replicas failed to be assigned
  RegionInfo metaReplicaHri2 =
    RegionReplicaUtil.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, 2);
  RegionStateNode metaReplicaRegionNode2 =
    am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri2);
  Assert.assertNull(metaReplicaRegionNode2.getRegionLocation());

  // showing master is active and running
  Assert.assertFalse(newMaster.isStopping());
  Assert.assertFalse(newMaster.isStopped());
  Assert.assertTrue(newMaster.isActiveMaster());
}

/**
 * Placeholder {@link TransitRegionStateProcedure} built with {@code null} for every reference
 * argument. The rigged assignment manager in this test attaches it to a region state node;
 * NOTE(review): presumably the pre-existing procedure is what makes the later assign fail,
 * confirm against the assignment manager's pre-transit checks.
 */
public static class BrokenTransitRegionStateProcedure extends TransitRegionStateProcedure {
  protected BrokenTransitRegionStateProcedure() {
    // Super-constructor arguments, in order: env, hri, assignCandidate, forceNewPlan, type.
    super(null, null, null, false, TransitionType.ASSIGN);
  }
}

/**
 * {@link HMaster} variant for this test whose assignment manager is a
 * {@link BrokenMasterMetaAssignmentManager}.
 */
public static class BrokenMetaReplicaMaster extends HMaster {
  public BrokenMetaReplicaMaster(final Configuration conf) throws IOException {
    super(conf);
  }

  /**
   * Supplies the rigged assignment manager instead of the regular one.
   * @param master the master services passed to the assignment manager constructor
   * @return a new {@link BrokenMasterMetaAssignmentManager}
   */
  @Override
  public AssignmentManager createAssignmentManager(MasterServices master) {
    final AssignmentManager riggedManager = new BrokenMasterMetaAssignmentManager(master);
    return riggedManager;
  }
}

public static class BrokenMasterMetaAssignmentManager extends AssignmentManager{
MasterServices master;
public BrokenMasterMetaAssignmentManager(final MasterServices master) {
super(master);
this.master = master;
}

public Future<byte[]> assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException {
RegionStateNode regionNode = getRegionStates().getOrCreateRegionStateNode(regionInfo);
if (regionNode.getRegionInfo().getReplicaId() == 2) {
regionNode.setProcedure(new BrokenTransitRegionStateProcedure());
}
return super.assignAsync(regionInfo, sn);
}
}
}

0 comments on commit 50d1a79

Please sign in to comment.