Skip to content

Commit

Permalink
HBASE-24075: Fix a race between master shutdown and metrics (re)init
Browse files Browse the repository at this point in the history
JMXCacheBuster resets the metrics state at various points in time. These
events can potentially race with a master shutdown. When the master is
tearing down, metrics initialization can touch a lot of unsafe state,
for example invalidated FS objects. To avoid this, this patch makes
getMetrics() a no-op when the master is either stopped or in the
process of shutting down. Additionally, getClusterId() is made a no-op
when the server is shutting down.

Writing a test that simulates this race is tricky, but with the patch
applied I no longer see the long stack traces from the JIRA locally.

Signed-off-by: Michael Stack <stack@apache.org>
  • Loading branch information
bharathv committed Mar 31, 2020
1 parent e5a8435 commit 6f213e9
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
@InterfaceAudience.Private
public interface MetricsMasterWrapper {

/**
 * Returns true if the master is currently running and is not in the process of
 * shutting down; returns false once the master is stopped or a shutdown has
 * been initiated. Used to skip metrics (re)registration during teardown.
 */
boolean isRunning();

/**
* Get ServerName
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ public void getMetrics(MetricsCollector metricsCollector, boolean all) {
MetricsRecordBuilder metricsRecordBuilder = metricsCollector.addRecord(metricsName);

// masterWrapper can be null because this function is called inside of init.
if (masterWrapper != null) {
// If the master is already stopped or has initiated a shutdown, no point in registering the
// metrics again.
if (masterWrapper != null && masterWrapper.isRunning()) {

// Pair<online region number, offline region number>
PairOfSameType<Integer> regionNumberPair = masterWrapper.getRegionCounts();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterId;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
Expand All @@ -46,8 +47,8 @@ public class CachedClusterId {
public static final Logger LOG = LoggerFactory.getLogger(CachedClusterId.class);
private static final int MAX_FETCH_TIMEOUT_MS = 10000;

private Path rootDir;
private FileSystem fs;
private final Path rootDir;
private final FileSystem fs;

// When true, indicates that a FileSystem fetch of ClusterID is in progress. This is used to
// avoid multiple fetches from FS and let only one thread fetch the information.
Expand All @@ -58,12 +59,15 @@ public class CachedClusterId {
// Immutable once set and read multiple times.
private ClusterId clusterId;

private final Server server;

// cache stats for testing.
private AtomicInteger cacheMisses = new AtomicInteger(0);

public CachedClusterId(Configuration conf) throws IOException {
rootDir = FSUtils.getRootDir(conf);
fs = rootDir.getFileSystem(conf);
/**
 * Creates a cache backed by the cluster ID file on the HBase root file system.
 *
 * @param server the owning server; consulted so that fetches can be skipped once the
 *               server is stopping or stopped (avoids touching invalidated FS state)
 * @param conf configuration used to resolve the HBase root directory and its FileSystem
 * @throws IOException if the root directory or FileSystem cannot be resolved
 */
public CachedClusterId(Server server, Configuration conf) throws IOException {
this.rootDir = FSUtils.getRootDir(conf);
this.fs = rootDir.getFileSystem(conf);
this.server = server;
}

/**
Expand Down Expand Up @@ -130,9 +134,12 @@ private void waitForFetchToFinish() throws InterruptedException {
* trying get from a clean cache.
*
* @return ClusterId by reading from FileSystem or null in any error case or cluster ID does
* not exist on the file system.
* not exist on the file system or if the server initiated a tear down.
*/
public String getFromCacheOrFetch() {
if (server.isStopping() || server.isStopped()) {
return null;
}
String id = getClusterId();
if (id != null) {
return id;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ public HMaster(final Configuration conf) throws IOException {
this.metaRegionLocationCache = null;
this.activeMasterManager = null;
}
cachedClusterId = new CachedClusterId(conf);
cachedClusterId = new CachedClusterId(this, conf);
} catch (Throwable t) {
// Make sure we log the exception. HMaster is often started via reflection and the
// cause of failed startup is lost.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ public int getNumDeadRegionServers() {
return serverManager.getDeadServers().size();
}

@Override
public boolean isRunning() {
  // Running means the master is neither stopped nor in the process of stopping.
  return !master.isStopped() && !master.isStopping();
}

@Override
public String getServerName() {
ServerName serverName = master.getServerName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ public void testClusterIdMatch() {
@Test
public void testMultiThreadedGetClusterId() throws Exception {
Configuration conf = TEST_UTIL.getConfiguration();
CachedClusterId cachedClusterId = new CachedClusterId(conf);
CachedClusterId cachedClusterId = new CachedClusterId(TEST_UTIL.getHBaseCluster().getMaster(),
conf);
TestContext context = new TestContext(conf);
int numThreads = 16;
for (int i = 0; i < numThreads; i++) {
Expand Down

0 comments on commit 6f213e9

Please sign in to comment.