From ea227e64fb8f8240d1dbc8e92056440b3b3e1a68 Mon Sep 17 00:00:00 2001 From: stack Date: Wed, 13 May 2020 22:19:25 -0700 Subject: [PATCH] HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java Edit a log. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java Add override of isMatchingRegionLocation. Allow 'null' as a pass in HBCKSCP. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java Add a method for HBCKSCP to override and be less strict filtering assigns. hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp Some doc on what 'Unknown Servers' are. --- .../GCMultipleMergedRegionsProcedure.java | 5 ++-- .../procedure/HBCKServerCrashProcedure.java | 13 ++++++++++ .../procedure/ServerCrashProcedure.java | 26 ++++++++++++++----- .../resources/hbase-webapps/master/hbck.jsp | 11 ++++++++ 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java index 4fc5484aca85..71fcd3544c3e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java @@ -99,12 +99,11 @@ protected Flow executeFromState(MasterProcedureEnv env, GCMergedRegionsState sta case GC_MERGED_REGIONS_PREPARE: // If GCMultipleMergedRegionsProcedure processing is slower than the CatalogJanitor's scan // interval, it will end resubmitting GCMultipleMergedRegionsProcedure for the same - // region, we can skip duplicate GCMultipleMergedRegionsProcedure while previous finished + // region. 
We can skip duplicate GCMultipleMergedRegionsProcedure while previous finished List parents = MetaTableAccessor.getMergeRegions( env.getMasterServices().getConnection(), mergedChild.getRegionName()); if (parents == null || parents.isEmpty()) { - LOG.info("Region=" + mergedChild.getShortNameToLog() - + " info:merge qualifier has been deleted"); + LOG.info("{} mergeXXX qualifiers have ALL been deleted", mergedChild.getShortNameToLog()); return Flow.NO_MORE_STATE; } setNextState(GCMergedRegionsState.GC_MERGED_REGIONS_PURGE); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java index eec820cc150a..a12b853e9197 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.RegionStateStore; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -168,4 +169,16 @@ private List getReassigns() { return this.reassigns; } } + + /** + * The RegionStateNode will not have a location if a confirm of an OPEN fails. On fail, + * the RegionStateNode regionLocation is set to null. This is 'looser' than the test done + * in the superclass. The HBCKSCP has been scheduled by an operator via hbck2 probably at the + * behest of a report of an 'Unknown Server' in the 'HBCK Report'. Let the operators operation + * succeed even in case where the region location in the RegionStateNode is null. 
+ */ + @Override + protected boolean isMatchingRegionLocation(RegionStateNode rsn) { + return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 178343feecd9..076c2668d2ef 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -450,6 +450,15 @@ protected boolean shouldWaitClientAck(MasterProcedureEnv env) { return false; } + /** + * Moved out here so can be overridden by the HBCK fix-up SCP to be less strict about what + * it will tolerate as a 'match'. + * @return True if the region location in rsn matches that of this crashed server. + */ + protected boolean isMatchingRegionLocation(RegionStateNode rsn) { + return this.serverName.equals(rsn.getRegionLocation()); + } + /** * Assign the regions on the crashed RS to other Rses. *

@@ -467,14 +476,17 @@ private void assignRegions(MasterProcedureEnv env, List regions) thr regionNode.lock(); try { // This is possible, as when a server is dead, TRSP will fail to schedule a RemoteProcedure - // to us and then try to assign the region to a new RS. And before it has updated the region + // and then try to assign the region to a new RS. And before it has updated the region // location to the new RS, we may have already called the am.getRegionsOnServer so we will - // consider the region is still on us. And then before we arrive here, the TRSP could have - // updated the region location, or even finished itself, so the region is no longer on us - // any more, we should not try to assign it again. Please see HBASE-23594 for more details. - if (!serverName.equals(regionNode.getRegionLocation())) { - LOG.info("{} found a region {} which is no longer on us {}, give up assigning...", this, - regionNode, serverName); + // consider the region is still on this crashed server. Then before we arrive here, the + // TRSP could have updated the region location, or even finished itself, so the region is + // no longer on this crashed server any more. We should not try to assign it again. Please + // see HBASE-23594 for more details. + // UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get + // in the way of our clearing out 'Unknown Servers'. + if (!isMatchingRegionLocation(regionNode)) { + LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...", + this, regionNode, serverName); continue; } if (regionNode.getProcedure() != null) { diff --git a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp index d90827c40184..9d391b743721 100644 --- a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp +++ b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp @@ -259,6 +259,17 @@

Unknown Servers

+

+ The below are servers mentioned in the hbase:meta table that are not known to the cluster either as 'live' or 'dead'. + The server likely belongs to an older epoch and we no longer have accounting. To clear, run + 'hbck2 scheduleRecoveries UNKNOWN_SERVERNAME' to schedule a ServerCrashProcedure to clear out references + and to schedule reassigns of any hosted Regions. But first, be sure the referenced Region is not currently + stuck looping trying to open. Does it show as a Region-In-Transition on the Master home page? Is it mentioned + in the 'Procedures and Locks' Procedures list? If so, perhaps it is stuck in a loop trying to open but unable to + because of a missing reference or file. Read the Master log looking for the most recent mentions of the associated + Region name. + +

RegionInfo