Skip to content

Commit

Permalink
HBASE-24075: Fix a race between master shutdown and metrics (re)init
Browse files Browse the repository at this point in the history
JMXCacheBuster resets the metrics state at various points in time. These
events can potentially race with a master shutdown. When the master is
tearing down, metrics initialization can touch a lot of unsafe state,
for example invalidated FS objects. To avoid this, this patch makes
getMetrics() a no-op when the master is either stopped or in the
process of shutting down. Additionally, getClusterId() is made a no-op
when the server is shutting down.

Writing a test that simulates this race is tricky, but with the patch
applied I no longer see the long stack traces from the JIRA locally.

Signed-off-by: Michael Stack <stack@apache.org>
  • Loading branch information
bharathv committed Mar 31, 2020
1 parent e5a8435 commit 6f213e9
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
@InterfaceAudience.Private
public interface MetricsMasterWrapper {

/**
 * Returns true if the master is currently running and is not in the process of
 * shutting down; returns false once the master is stopped or a shutdown has
 * been initiated. Used to skip metrics (re)registration during teardown.
 */
boolean isRunning();

/**
* Get ServerName
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ public void getMetrics(MetricsCollector metricsCollector, boolean all) {
MetricsRecordBuilder metricsRecordBuilder = metricsCollector.addRecord(metricsName);

// masterWrapper can be null because this function is called inside of init.
if (masterWrapper != null) {
// If the master is already stopped or has initiated a shutdown, no point in registering the
// metrics again.
if (masterWrapper != null && masterWrapper.isRunning()) {

// Pair<online region number, offline region number>
PairOfSameType<Integer> regionNumberPair = masterWrapper.getRegionCounts();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterId;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
Expand All @@ -46,8 +47,8 @@ public class CachedClusterId {
public static final Logger LOG = LoggerFactory.getLogger(CachedClusterId.class);
private static final int MAX_FETCH_TIMEOUT_MS = 10000;

private Path rootDir;
private FileSystem fs;
private final Path rootDir;
private final FileSystem fs;

// When true, indicates that a FileSystem fetch of ClusterID is in progress. This is used to
// avoid multiple fetches from FS and let only one thread fetch the information.
Expand All @@ -58,12 +59,15 @@ public class CachedClusterId {
// Immutable once set and read multiple times.
private ClusterId clusterId;

private final Server server;

// cache stats for testing.
private AtomicInteger cacheMisses = new AtomicInteger(0);

public CachedClusterId(Configuration conf) throws IOException {
rootDir = FSUtils.getRootDir(conf);
fs = rootDir.getFileSystem(conf);
/**
 * Creates a cache backed by the cluster ID file on the HBase root file system.
 *
 * @param server the owning server; consulted so that fetches can be skipped once the
 *               server is stopping or stopped (avoids touching invalidated FS state)
 * @param conf configuration used to resolve the HBase root directory and its FileSystem
 * @throws IOException if the root directory or FileSystem cannot be resolved
 */
public CachedClusterId(Server server, Configuration conf) throws IOException {
this.rootDir = FSUtils.getRootDir(conf);
this.fs = rootDir.getFileSystem(conf);
this.server = server;
}

/**
Expand Down Expand Up @@ -130,9 +134,12 @@ private void waitForFetchToFinish() throws InterruptedException {
* trying get from a clean cache.
*
* @return ClusterId by reading from FileSystem or null in any error case or cluster ID does
* not exist on the file system.
* not exist on the file system or if the server initiated a tear down.
*/
public String getFromCacheOrFetch() {
if (server.isStopping() || server.isStopped()) {
return null;
}
String id = getClusterId();
if (id != null) {
return id;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ public HMaster(final Configuration conf) throws IOException {
this.metaRegionLocationCache = null;
this.activeMasterManager = null;
}
cachedClusterId = new CachedClusterId(conf);
cachedClusterId = new CachedClusterId(this, conf);
} catch (Throwable t) {
// Make sure we log the exception. HMaster is often started via reflection and the
// cause of failed startup is lost.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ public int getNumDeadRegionServers() {
return serverManager.getDeadServers().size();
}

@Override
public boolean isRunning() {
  // Running means the master is neither stopped nor in the process of stopping.
  return !master.isStopped() && !master.isStopping();
}

@Override
public String getServerName() {
ServerName serverName = master.getServerName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ public void testClusterIdMatch() {
@Test
public void testMultiThreadedGetClusterId() throws Exception {
Configuration conf = TEST_UTIL.getConfiguration();
CachedClusterId cachedClusterId = new CachedClusterId(conf);
CachedClusterId cachedClusterId = new CachedClusterId(TEST_UTIL.getHBaseCluster().getMaster(),
conf);
TestContext context = new TestContext(conf);
int numThreads = 16;
for (int i = 0; i < numThreads; i++) {
Expand Down

0 comments on commit 6f213e9

Please sign in to comment.