Skip to content

Commit

Permalink
master hangs forever if RecoverMeta send assign meta region request t…
Browse files Browse the repository at this point in the history
…o target server fail
  • Loading branch information
Yi Liang authored and saintstack committed Dec 14, 2017
1 parent ba5f9ac commit d3aeaef
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 2 deletions.
Expand Up @@ -572,6 +572,10 @@ public synchronized void expireServer(final ServerName serverName) {
if (!master.isServerCrashProcessingEnabled()) {
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
+ "delay expiring server " + serverName);
// Even we delay expire this server, we still need to handle Meta's RIT
// that are against the crashed server; since when we do RecoverMetaProcedure,
// the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
this.queuedDeadServers.add(serverName);
return;
}
Expand Down
Expand Up @@ -48,6 +48,7 @@
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.TableState;
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
Expand All @@ -70,6 +71,7 @@
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
Expand All @@ -78,6 +80,7 @@
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
Expand Down Expand Up @@ -1322,7 +1325,7 @@ public int getNumRegionsOpened() {
}

public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName);
boolean carryingMeta = isCarryingMeta(serverName);
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName,
shouldSplitWal, carryingMeta));
Expand Down Expand Up @@ -1853,4 +1856,31 @@ public void killRegionServer(final ServerStateNode serverNode) {
}*/
master.getServerManager().expireServer(serverNode.getServerName());
}

/**
* Handle RIT of meta region against crashed server
* Only used when ServerCrashProcedure is not enabled.
*
* @param serverName Server that has already crashed
*/
public void handleMetaRITOnCrashedServer(ServerName serverName) {
RegionInfo hri = RegionReplicaUtil
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
RegionInfo.DEFAULT_REPLICA_ID);
RegionState regionStateNode = getRegionStates().getRegionState(hri);
if (!regionStateNode.getServerName().equals(serverName)) {
return;
}
// meta has been assigned to crashed server.
LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign");
// handle failure and wake event
RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
// Not need to consider for REGION_TRANSITION_QUEUE step
if (rtp != null && rtp.isMeta()
&& rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
LOG.info("Re-do rit procedure: " + rtp.toString());
rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
new ServerCrashException(rtp.getProcId(), serverName));
}
}
}
Expand Up @@ -212,9 +212,14 @@ public TableDescriptors getTableDescriptors() {
return null;
}

private boolean serverCrashProcessingEnabled = true;

public void setServerCrashProcessingEnabled(boolean b) {
serverCrashProcessingEnabled = b;
}
@Override
public boolean isServerCrashProcessingEnabled() {
return true;
return serverCrashProcessingEnabled;
}

@Override
Expand Down
Expand Up @@ -21,6 +21,7 @@

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.SortedSet;
Expand Down Expand Up @@ -173,6 +174,30 @@ public void start(final int numServes, final RSProcedureDispatcher remoteDispatc
this.procedureExecutor.getEnvironment().setEventReady(initialized, true);
}

/**
* Call this restart method only after running MockMasterServices#start()
* The RSs can be differentiated by the port number, see
* ServerName in MockMasterServices#start() method above.
* Restart of region server will have new startcode in server name
*
* @param serverName Server name to be restarted
*/
public void restartRegionServer(ServerName serverName) throws IOException {
List<ServerName> onlineServers = serverManager.getOnlineServersList();
long startCode = -1;
for (ServerName s : onlineServers) {
if (s.getAddress().equals(serverName.getAddress())) {
startCode = s.getStartcode() + 1;
break;
}
}
if (startCode == -1) {
return;
}
ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode);
serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD);
}

@Override
public void stop(String why) {
stopProcedureExecutor();
Expand Down
Expand Up @@ -444,6 +444,34 @@ public void testUnassignAnUnassignedRegion() throws Exception {
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
}

/**
* It is possible that when AM send assign meta request to a RS successfully,
* but RS can not send back any response, which cause master startup hangs forever
*/
@Test
public void testAssignMetaAndCrashBeforeResponse() throws Exception {
tearDown();
// See setUp(), start HBase until set up meta
UTIL = new HBaseTestingUtility();
this.executor = Executors.newSingleThreadScheduledExecutor();
setupConfiguration(UTIL.getConfiguration());
master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
rsDispatcher = new MockRSProcedureDispatcher(master);
master.start(NSERVERS, rsDispatcher);
am = master.getAssignmentManager();

// Assign meta
master.setServerCrashProcessingEnabled(false);
rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor());
am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
assertEquals(true, am.isMetaInitialized());

// set it back as default, see setUpMeta()
master.setServerCrashProcessingEnabled(true);
am.wakeMetaLoadedEvent();
am.setFailoverCleanupDone(true);
}

private Future<byte[]> submitProcedure(final Procedure proc) {
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
}
Expand Down Expand Up @@ -527,6 +555,14 @@ private void doCrash(final ServerName serverName) {
this.am.submitServerCrash(serverName, false/*No WALs here*/);
}

private void doRestart(final ServerName serverName) {
try {
this.master.restartRegionServer(serverName);
} catch (IOException e) {
LOG.warn("Can not restart RS with new startcode");
}
}

private class NoopRsExecutor implements MockRSExecutor {
public ExecuteProceduresResponse sendRequest(ServerName server,
ExecuteProceduresRequest request) throws IOException {
Expand Down Expand Up @@ -678,6 +714,37 @@ public void run() {
}
}

/**
* Takes open request and then returns nothing so acts like a RS that went zombie.
* No response (so proc is stuck/suspended on the Master and won't wake up.).
* Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create
* ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor
* will restart RS directly, situation for RS crashed when SCP is not enabled.
*/
private class HangThenRSRestartExecutor extends GoodRsExecutor {
private int invocations;

@Override
protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
throws IOException {
if (this.invocations++ > 0) {
// Return w/o problem the second time through here.
return super.execOpenRegion(server, openReq);
}
// The procedure on master will just hang forever because nothing comes back
// from the RS in this case.
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
executor.schedule(new Runnable() {
@Override
public void run() {
LOG.info("Restarting RS of " + server);
doRestart(server);
}
}, 1, TimeUnit.SECONDS);
return null;
}
}

private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
public static final int TYPES_OF_FAILURE = 6;
private int invocations;
Expand Down

0 comments on commit d3aeaef

Please sign in to comment.