Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-24562: Stabilize master startup with meta replicas enabled #1997

Merged
merged 3 commits into from
Jul 3, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,11 @@ private void finishActiveMasterInitialization(MonitoredTask status)
assignmentManager.checkIfShouldMoveSystemRegionAsync();
status.setStatus("Assign meta replicas");
MasterMetaBootstrap metaBootstrap = createMetaBootstrap();
metaBootstrap.assignMetaReplicas();
try {
metaBootstrap.assignMetaReplicas();
} catch (IOException | KeeperException e){
LOG.error("Assigning meta replica failed: ", e);
}
status.setStatus("Starting quota manager");
initQuotaManager();
if (QuotaUtil.isQuotaEnabled(conf)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ void assignMetaReplicas()
// down hosting server which calls AM#stop.
if (metaState != null && metaState.getServerName() != null) {
// Try to retain old assignment.
assignmentManager.assign(hri, metaState.getServerName());
assignmentManager.assignAsync(hri, metaState.getServerName());
} else {
assignmentManager.assign(hri);
assignmentManager.assignAsync(hri);
}
}
unassignExcessMetaReplica(numReplicas);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -589,9 +589,9 @@ private void preTransitCheck(RegionStateNode regionNode, RegionState.State[] exp
}
}

// TODO: Need an async version of this for hbck2.
public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
// TODO: should we use getRegionStateNode?
private TransitRegionStateProcedure createAssignProcedure(RegionInfo regionInfo, ServerName sn)
throws IOException {
// TODO: should we use getRegionStateNode?
RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo);
TransitRegionStateProcedure proc;
regionNode.lock();
Expand All @@ -602,6 +602,12 @@ public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
} finally {
regionNode.unlock();
}
return proc;
}

// TODO: Need an async version of this for hbck2.
public long assign(RegionInfo regionInfo, ServerName sn) throws IOException {
TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn);
ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc);
return proc.getProcId();
}
Expand All @@ -610,6 +616,28 @@ public long assign(RegionInfo regionInfo) throws IOException {
return assign(regionInfo, null);
}

/**
* Submits a procedure that assigns a region to a target server without waiting for it to finish
* @param regionInfo the region we would like to assign
* @param sn target server name
* @return
* @throws IOException
*/
public Future<byte[]> assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException {
TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn);
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
}

/**
* Submits a procedure that assigns a region without waiting for it to finish
* @param regionInfo the region we would like to assign
* @return
* @throws IOException
*/
public Future<byte[]> assignAsync(RegionInfo regionInfo) throws IOException {
return assignAsync(regionInfo, null);
}

public long unassign(RegionInfo regionInfo) throws IOException {
RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
if (regionNode == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Abortable;
Expand All @@ -41,9 +43,13 @@
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
Expand All @@ -52,7 +58,9 @@
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
import org.apache.zookeeper.KeeperException;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Rule;
Expand Down Expand Up @@ -350,4 +358,73 @@ public void testShutdownOfReplicaHolder() throws Exception {
assertNotEquals(3, i);
}
}

@Test
public void testFailedReplicaAssigment() throws InterruptedException, IOException {
//using our rigged master, to force a failed meta replica assignment
TEST_UTIL.getMiniHBaseCluster().getConfiguration().setClass(HConstants.MASTER_IMPL,
BrokenMetaReplicaMaster.class, HMaster.class);
TEST_UTIL.getMiniHBaseCluster().stopMaster(0).join();
HMaster newMaster = TEST_UTIL.getMiniHBaseCluster().startMaster().getMaster();
//waiting for master to come up
TEST_UTIL.waitFor(30000, () -> newMaster.isInitialized());
TEST_UTIL.getMiniHBaseCluster().getConfiguration().unset(HConstants.MASTER_IMPL);

AssignmentManager am = newMaster.getAssignmentManager();
//showing one of the replicas got assigned
RegionInfo metaReplicaHri = RegionReplicaUtil.getRegionInfoForReplica(
RegionInfoBuilder.FIRST_META_REGIONINFO, 1);
TEST_UTIL.waitFor(30000, () ->
am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri) != null &&
am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri).getRegionLocation()
!= null);
RegionStateNode metaReplicaRegionNode =
am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri);
Assert.assertNotNull(metaReplicaRegionNode.getRegionLocation());
//showing one of the replicas failed to be assigned
RegionInfo metaReplicaHri2 = RegionReplicaUtil.getRegionInfoForReplica(
RegionInfoBuilder.FIRST_META_REGIONINFO, 2);
RegionStateNode metaReplicaRegionNode2 =
am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri2);
Assert.assertNull(metaReplicaRegionNode2.getRegionLocation());

//showing master is active and running
Assert.assertFalse(newMaster.isStopping());
Assert.assertFalse(newMaster.isStopped());
Assert.assertTrue(newMaster.isActiveMaster());
}

public static class BrokenTransitRegionStateProcedure extends TransitRegionStateProcedure {
protected BrokenTransitRegionStateProcedure() {
//super(env, hri, assignCandidate, forceNewPlan, type);
super(null, null, null, false,TransitionType.ASSIGN);
}
}

public static class BrokenMetaReplicaMaster extends HMaster{
public BrokenMetaReplicaMaster(final Configuration conf) throws IOException, KeeperException {
super(conf);
}

@Override
public AssignmentManager createAssignmentManager(MasterServices master) {
return new BrokenMasterMetaAssignmentManager(master);
}
}

public static class BrokenMasterMetaAssignmentManager extends AssignmentManager{
MasterServices master;
public BrokenMasterMetaAssignmentManager(final MasterServices master) {
super(master);
this.master = master;
}

public Future<byte[]> assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException {
RegionStateNode regionNode = getRegionStates().getOrCreateRegionStateNode(regionInfo);
if (regionNode.getRegionInfo().getReplicaId() == 2) {
regionNode.setProcedure(new BrokenTransitRegionStateProcedure());
}
return super.assignAsync(regionInfo, sn);
}
}
}