From 030e833dc9a8eac9f69b6c77c2dc8c73d64871be Mon Sep 17 00:00:00 2001 From: stack Date: Fri, 27 Mar 2020 09:36:05 -0700 Subject: [PATCH] HBASE-24052 Add debug+fix to TestMasterShutdown Add check for stopped server at a few more points in Master startup. Defend against NPE in RSProcedureDispatcher; log and retun instead. --- .../apache/hadoop/hbase/master/HMaster.java | 1 + .../procedure/RSProcedureDispatcher.java | 21 ++++++- .../hbase/regionserver/HRegionServer.java | 39 +++++++------ .../hbase/master/TestMasterShutdown.java | 58 ++++--------------- 4 files changed, 53 insertions(+), 66 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index a48c90626771..f244e76fcd40 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -2793,6 +2793,7 @@ public String[] getMasterCoprocessors() { @Override public void abort(String reason, Throwable cause) { if (isAborted() || isStopped()) { + LOG.debug("Abort called but aborted={}, stopped={}", isAborted(), isStopped()); return; } setAbortRequested(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java index e72d60c5b061..0a5450ba4b7a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.ServerListener; import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher; import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException; import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException; @@ -95,10 +96,24 @@ public boolean start() { return false; } // Around startup, if failed, some of the below may be set back to null so NPE is possible. + ServerManager sm = master.getServerManager(); + if (sm == null) { + LOG.debug("ServerManager is null; stopping={}", master.isStopping()); + return false; + } + sm.registerListener(this); + ProcedureExecutor pe = master.getMasterProcedureExecutor(); + if (pe == null) { + LOG.debug("ProcedureExecutor is null; stopping={}", master.isStopping()); + return false; + } + procedureEnv = pe.getEnvironment(); + if (this.procedureEnv == null) { + LOG.debug("ProcedureEnv is null; stopping={}", master.isStopping()); + return false; + } try { - master.getServerManager().registerListener(this); - procedureEnv = master.getMasterProcedureExecutor().getEnvironment(); - for (ServerName serverName : master.getServerManager().getOnlineServersList()) { + for (ServerName serverName : sm.getOnlineServersList()) { addNode(serverName); } } catch (Exception e) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 2a9ca86e9e98..e9e6d7d2a426 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -942,6 +942,10 @@ public boolean isClusterUp() { */ @Override public void run() { + if (isStopped()) { + LOG.info("Skipping run; stopped"); + return; + } try { // Do pre-registration initializations; zookeeper, lease threads, etc. preRegistrationInitialization(); @@ -955,24 +959,25 @@ public void run() { // Initialize the RegionServerCoprocessorHost now that our ephemeral // node was created, in case any coprocessors want to use ZooKeeper this.rsHost = new RegionServerCoprocessorHost(this, this.conf); - } - // Try and register with the Master; tell it we are here. Break if server is stopped or the - // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start - // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up. - LOG.debug("About to register with Master."); - RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, - this.sleeper.getPeriod(), 1000 * 60 * 5); - RetryCounter rc = rcf.create(); - while (keepLooping()) { - RegionServerStartupResponse w = reportForDuty(); - if (w == null) { - long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); - LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); - this.sleeper.sleep(sleepTime); - } else { - handleReportForDutyResponse(w); - break; + // Try and register with the Master; tell it we are here. Break if server is stopped or + // the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and + // start up all Services. Use RetryCounter to get backoff in case Master is struggling to + // come up. + LOG.debug("About to register with Master."); + RetryCounterFactory rcf = + new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5); + RetryCounter rc = rcf.create(); + while (keepLooping()) { + RegionServerStartupResponse w = reportForDuty(); + if (w == null) { + long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); + LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); + this.sleeper.sleep(sleepTime); + } else { + handleReportForDutyResponse(w); + break; + } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java index a5e596f79d7c..7b3921e3ecbd 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java @@ -21,10 +21,7 @@ import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotNull; import java.io.IOException; -import java.time.Duration; import java.util.List; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.CompletionException; import java.util.concurrent.TimeUnit; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClusterMetrics; @@ -35,8 +32,6 @@ import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.StartMiniClusterOption; import org.apache.hadoop.hbase.Waiter; -import org.apache.hadoop.hbase.client.AsyncConnection; -import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; @@ -156,48 +151,19 @@ public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception { hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(), options.getNumRegionServers(), options.getMasterClass(), options.getRsClass()); final MasterThread masterThread = hbaseCluster.getMasters().get(0); - - final CompletableFuture shutdownFuture = CompletableFuture.runAsync(() -> { - // Switching to master registry exacerbated a race in the master bootstrap that can result - // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because - // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is - // made to the master. The suspected reason as to why it was uncommon before HBASE-18095 - // is because the connection creation with ZK registry is so slow that by then the server - // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit - // wait() in the test, waiting for the server manager to become available. - final long timeout = TimeUnit.MINUTES.toMillis(10); - assertNotEquals("timeout waiting for server manager to become available.", - -1, Waiter.waitFor(htu.getConfiguration(), timeout, - () -> masterThread.getMaster().getServerManager() != null)); - - // Master has come up far enough that we can terminate it without creating a zombie. - final long result = Waiter.waitFor(htu.getConfiguration(), timeout, 500, () -> { - final Configuration conf = createResponsiveZkConfig(htu.getConfiguration()); - LOG.debug("Attempting to establish connection."); - final CompletableFuture connFuture = - ConnectionFactory.createAsyncConnection(conf); - try (final AsyncConnection conn = connFuture.join()) { - LOG.debug("Sending shutdown RPC."); - try { - conn.getAdmin().shutdown().join(); - LOG.debug("Shutdown RPC sent."); - return true; - } catch (CompletionException e) { - LOG.debug("Failure sending shutdown RPC."); - } - } catch (IOException|CompletionException e) { - LOG.debug("Failed to establish connection."); - } catch (Throwable e) { - LOG.info("Something unexpected happened.", e); - } - return false; - }); - assertNotEquals("Failed to issue shutdown RPC after " + Duration.ofMillis(timeout), - -1, result); - }); - masterThread.start(); - shutdownFuture.join(); + // Switching to master registry exacerbated a race in the master bootstrap that can result + // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because + // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is + // made to the master. The suspected reason as to why it was uncommon before HBASE-18095 + // is because the connection creation with ZK registry is so slow that by then the server + // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit + // wait() in the test, waiting for the server manager to become available. + final long timeout = TimeUnit.MINUTES.toMillis(10); + assertNotEquals("Timeout waiting for server manager to become available.", + -1, Waiter.waitFor(htu.getConfiguration(), timeout, + () -> masterThread.getMaster().getServerManager() != null)); + htu.getConnection().getAdmin().shutdown(); masterThread.join(); } finally { if (hbaseCluster != null) {