From d3aeaeffa455f32b1eb2fbb871b39707457f684d Mon Sep 17 00:00:00 2001 From: Yi Liang Date: Wed, 13 Dec 2017 13:49:59 -0800 Subject: [PATCH] master hangs forever if RecoverMeta send assign meta region request to target server fail --- .../hadoop/hbase/master/ServerManager.java | 4 ++ .../master/assignment/AssignmentManager.java | 32 ++++++++- .../hbase/master/MockNoopMasterServices.java | 7 +- .../master/assignment/MockMasterServices.java | 25 +++++++ .../assignment/TestAssignmentManager.java | 67 +++++++++++++++++++ 5 files changed, 133 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 79ffc8a5825f..78661d253ef8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -572,6 +572,10 @@ public synchronized void expireServer(final ServerName serverName) { if (!master.isServerCrashProcessingEnabled()) { LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " + "delay expiring server " + serverName); + // Even we delay expire this server, we still need to handle Meta's RIT + // that are against the crashed server; since when we do RecoverMetaProcedure, + // the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287 + master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName); this.queuedDeadServers.add(serverName); return; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index cebe0b046583..7a1c8afbc168 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.YouAreDeadException; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.favored.FavoredNodesManager; @@ -70,6 +71,7 @@ import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler; import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait; +import org.apache.hadoop.hbase.master.procedure.ServerCrashException; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureEvent; @@ -78,6 +80,7 @@ import org.apache.hadoop.hbase.procedure2.util.StringUtils; import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; @@ -1322,7 +1325,7 @@ public int getNumRegionsOpened() { } public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { - boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName); + boolean carryingMeta = isCarryingMeta(serverName); ProcedureExecutor procExec = this.master.getMasterProcedureExecutor(); procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName, shouldSplitWal, carryingMeta)); @@ -1853,4 +1856,31 @@ public void killRegionServer(final ServerStateNode serverNode) { }*/ master.getServerManager().expireServer(serverNode.getServerName()); } + + /** + * Handle RIT of meta region against crashed server + * Only used when ServerCrashProcedure is not enabled. + * + * @param serverName Server that has already crashed + */ + public void handleMetaRITOnCrashedServer(ServerName serverName) { + RegionInfo hri = RegionReplicaUtil + .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, + RegionInfo.DEFAULT_REPLICA_ID); + RegionState regionStateNode = getRegionStates().getRegionState(hri); + if (!regionStateNode.getServerName().equals(serverName)) { + return; + } + // meta has been assigned to crashed server. + LOG.info("Meta has been assigned to crashed server: " + serverName + "; will do re-assign"); + // handle failure and wake event + RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri); + // Not need to consider for REGION_TRANSITION_QUEUE step + if (rtp != null && rtp.isMeta() + && rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) { + LOG.info("Re-do rit procedure: " + rtp.toString()); + rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName, + new ServerCrashException(rtp.getProcId(), serverName)); + } + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java index 6a3d5c76b09a..413abe39ad30 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java @@ -212,9 +212,14 @@ public TableDescriptors getTableDescriptors() { return null; } + private boolean serverCrashProcessingEnabled = true; + + public void setServerCrashProcessingEnabled(boolean b) { + serverCrashProcessingEnabled = b; + } @Override public boolean isServerCrashProcessingEnabled() { - return true; + return serverCrashProcessingEnabled; } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java index 6a75729b6ed6..764aa4a18509 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.NavigableMap; import java.util.SortedSet; @@ -173,6 +174,30 @@ public void start(final int numServes, final RSProcedureDispatcher remoteDispatc this.procedureExecutor.getEnvironment().setEventReady(initialized, true); } + /** + * Call this restart method only after running MockMasterServices#start() + * The RSs can be differentiated by the port number, see + * ServerName in MockMasterServices#start() method above. + * Restart of region server will have new startcode in server name + * + * @param serverName Server name to be restarted + */ + public void restartRegionServer(ServerName serverName) throws IOException { + List onlineServers = serverManager.getOnlineServersList(); + long startCode = -1; + for (ServerName s : onlineServers) { + if (s.getAddress().equals(serverName.getAddress())) { + startCode = s.getStartcode() + 1; + break; + } + } + if (startCode == -1) { + return; + } + ServerName sn = ServerName.valueOf(serverName.getAddress().toString(), startCode); + serverManager.regionServerReport(sn, ServerLoad.EMPTY_SERVERLOAD); + } + @Override public void stop(String why) { stopProcedureExecutor(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java index 1912d1168d7d..c328c711a6f1 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java @@ -444,6 +444,34 @@ public void testUnassignAnUnassignedRegion() throws Exception { assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount()); } + /** + * It is possible that when AM send assign meta request to a RS successfully, + * but RS can not send back any response, which cause master startup hangs forever + */ + @Test + public void testAssignMetaAndCrashBeforeResponse() throws Exception { + tearDown(); + // See setUp(), start HBase until set up meta + UTIL = new HBaseTestingUtility(); + this.executor = Executors.newSingleThreadScheduledExecutor(); + setupConfiguration(UTIL.getConfiguration()); + master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers); + rsDispatcher = new MockRSProcedureDispatcher(master); + master.start(NSERVERS, rsDispatcher); + am = master.getAssignmentManager(); + + // Assign meta + master.setServerCrashProcessingEnabled(false); + rsDispatcher.setMockRsExecutor(new HangThenRSRestartExecutor()); + am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO); + assertEquals(true, am.isMetaInitialized()); + + // set it back as default, see setUpMeta() + master.setServerCrashProcessingEnabled(true); + am.wakeMetaLoadedEvent(); + am.setFailoverCleanupDone(true); + } + private Future submitProcedure(final Procedure proc) { return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc); } @@ -527,6 +555,14 @@ private void doCrash(final ServerName serverName) { this.am.submitServerCrash(serverName, false/*No WALs here*/); } + private void doRestart(final ServerName serverName) { + try { + this.master.restartRegionServer(serverName); + } catch (IOException e) { + LOG.warn("Can not restart RS with new startcode"); + } + } + private class NoopRsExecutor implements MockRSExecutor { public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest request) throws IOException { @@ -678,6 +714,37 @@ public void run() { } } + /** + * Takes open request and then returns nothing so acts like a RS that went zombie. + * No response (so proc is stuck/suspended on the Master and won't wake up.). + * Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create + * ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor + * will restart RS directly, situation for RS crashed when SCP is not enabled. + */ + private class HangThenRSRestartExecutor extends GoodRsExecutor { + private int invocations; + + @Override + protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq) + throws IOException { + if (this.invocations++ > 0) { + // Return w/o problem the second time through here. + return super.execOpenRegion(server, openReq); + } + // The procedure on master will just hang forever because nothing comes back + // from the RS in this case. + LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout"); + executor.schedule(new Runnable() { + @Override + public void run() { + LOG.info("Restarting RS of " + server); + doRestart(server); + } + }, 1, TimeUnit.SECONDS); + return null; + } + } + private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor { public static final int TYPES_OF_FAILURE = 6; private int invocations;