HBASE-25221 Backport HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null #2590

Merged: 1 commit, Oct 28, 2020.
@@ -79,12 +79,12 @@ protected Flow executeFromState(MasterProcedureEnv env, GCMergedRegionsState state)
       case GC_MERGED_REGIONS_PREPARE:
         // If GCMultipleMergedRegionsProcedure processing is slower than the CatalogJanitor's scan
         // interval, it will end resubmitting GCMultipleMergedRegionsProcedure for the same
-        // region, we can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
+        // region. We can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
         List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(
           env.getMasterServices().getConnection(), mergedChild.getRegionName());
         if (parents == null || parents.isEmpty()) {
-          LOG.info("Region=" + mergedChild.getShortNameToLog()
-            + " info:merge qualifier has been deleted");
+          LOG.info("{} mergeXXX qualifiers have ALL been deleted",
+            mergedChild.getShortNameToLog());
           return Flow.NO_MORE_STATE;
         }
         setNextState(GCMergedRegionsState.GC_MERGED_REGIONS_PURGE);
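As context for the comment in this hunk: the prepare state acts as an idempotency guard, re-reading the authoritative merge qualifiers from hbase:meta and ending the procedure early when a duplicate submission finds a prior run already purged them. Below is a minimal sketch of that guard shape; the MetaReader interface is a hypothetical stand-in for the MetaTableAccessor lookup, not HBase's API.

import java.util.List;

// Hypothetical, minimal shape of the duplicate-submission guard in the
// GC_MERGED_REGIONS_PREPARE step: re-read the source of truth and finish
// early when there is nothing left to do.
public class GcPrepareSketch {
  enum Flow { HAS_MORE_STATE, NO_MORE_STATE }

  // Stand-in for the MetaTableAccessor.getMergeRegions lookup.
  interface MetaReader {
    List<String> getMergeQualifiers(String mergedChildRegion);
  }

  static Flow prepare(MetaReader meta, String mergedChildRegion) {
    List<String> parents = meta.getMergeQualifiers(mergedChildRegion);
    if (parents == null || parents.isEmpty()) {
      // A previous procedure already deleted the mergeXXX qualifiers, so this
      // resubmitted procedure can complete without doing any work.
      return Flow.NO_MORE_STATE;
    }
    // Qualifiers still present: fall through to the purge state.
    return Flow.HAS_MORE_STATE;
  }
}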
@@ -30,6 +30,7 @@
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
 import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -169,4 +170,16 @@ private List<RegionInfo> getReassigns() {
     return this.reassigns;
   }
 }
+
+  /**
+   * The RegionStateNode will not have a location if a confirm of an OPEN fails. On fail, the
+   * RegionStateNode regionLocation is set to null. This is 'looser' than the test done in the
+   * superclass. The HBCKSCP has been scheduled by an operator via hbck2, probably at the
+   * behest of a report of an 'Unknown Server' in the 'HBCK Report'. Let the operator's
+   * operation succeed even in the case where the region location in the RegionStateNode is null.
+   */
+  @Override
+  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
+    return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null;
+  }
 }

Review thread on isMatchingRegionLocation:

[Contributor] I'm good w/ this. Just need to watch it. I've added changes and then found I sometimes made the situation worse. Need to watch out for that here, but it is HBCKSCP, run by the operator, so they should be looking at the result when manually running hbck2.

[Contributor] Is it that the RSN location is null but hbase:meta still has the 'unknown server'?

[Contributor] Yeah... it seems so, but the compare to the RSN fails so we don't clean the unknown server... ok. Got it.

[Contributor, author] Yes, this is my understanding. Thanks for the review!
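To make the effect of the looser check concrete, here is a minimal, runnable sketch. The ServerName and RegionStateNode classes below are hypothetical stand-ins reduced to what the matching logic touches; only the two isMatchingRegionLocation bodies mirror the patch.

// Hypothetical stand-in for HBase's ServerName, reduced to identity.
final class ServerName {
  private final String name;
  ServerName(String name) { this.name = name; }
  @Override public boolean equals(Object o) {
    return o instanceof ServerName && ((ServerName) o).name.equals(this.name);
  }
  @Override public int hashCode() { return name.hashCode(); }
}

// Hypothetical stand-in for the RegionStateNode: just a (nullable) location.
final class RegionStateNode {
  private final ServerName regionLocation; // null after a failed OPEN confirm
  RegionStateNode(ServerName regionLocation) { this.regionLocation = regionLocation; }
  ServerName getRegionLocation() { return regionLocation; }
}

class ServerCrashProcedure {
  protected final ServerName serverName;
  ServerCrashProcedure(ServerName serverName) { this.serverName = serverName; }
  // Strict match, as in the superclass hook added by this patch.
  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
    return this.serverName.equals(rsn.getRegionLocation());
  }
}

class HBCKServerCrashProcedure extends ServerCrashProcedure {
  HBCKServerCrashProcedure(ServerName serverName) { super(serverName); }
  // Looser match: also claim regions whose location was nulled.
  @Override
  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
    return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null;
  }
}

public class MatchDemo {
  public static void main(String[] args) {
    ServerName dead = new ServerName("unknown-host,16020,1603000000000");
    RegionStateNode lostLocation = new RegionStateNode(null);
    // The strict check skips the region, so the 'Unknown Server' lingers...
    System.out.println(new ServerCrashProcedure(dead).isMatchingRegionLocation(lostLocation));     // false
    // ...while the HBCK variant claims it and can schedule a reassign.
    System.out.println(new HBCKServerCrashProcedure(dead).isMatchingRegionLocation(lostLocation)); // true
  }
}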
@@ -451,6 +451,15 @@ protected boolean shouldWaitClientAck(MasterProcedureEnv env) {
     return false;
   }
 
+  /**
+   * Moved out here so it can be overridden by the HBCK fix-up SCP, which is less strict about
+   * what it will tolerate as a 'match'.
+   * @return True if the region location in <code>rsn</code> matches that of this crashed server.
+   */
+  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
+    return this.serverName.equals(rsn.getRegionLocation());
+  }
+
   /**
    * Assign the regions on the crashed RS to other Rses.
    * <p/>
@@ -468,20 +477,23 @@ private void assignRegions(MasterProcedureEnv env, List<RegionInfo> regions) throws IOException {
       regionNode.lock();
       try {
         // This is possible, as when a server is dead, TRSP will fail to schedule a RemoteProcedure
-        // to us and then try to assign the region to a new RS. And before it has updated the region
+        // and then try to assign the region to a new RS. And before it has updated the region
         // location to the new RS, we may have already called the am.getRegionsOnServer so we will
-        // consider the region is still on us. And then before we arrive here, the TRSP could have
-        // updated the region location, or even finished itself, so the region is no longer on us
-        // any more, we should not try to assign it again. Please see HBASE-23594 for more details.
-        if (!serverName.equals(regionNode.getRegionLocation())) {
+        // consider the region is still on this crashed server. Then before we arrive here, the
+        // TRSP could have updated the region location, or even finished itself, so the region is
+        // no longer on this crashed server any more. We should not try to assign it again. Please
+        // see HBASE-23594 for more details.
+        // UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get
+        // in the way of our clearing out 'Unknown Servers'.
+        if (!isMatchingRegionLocation(regionNode)) {
           // See HBASE-24117, though we have already changed the shutdown order, it is still worth
           // double checking here to confirm that we do not skip assignment incorrectly.
           if (!am.isRunning()) {
             throw new DoNotRetryIOException(
-              "AssignmentManager has been stopped, can not process assignment any more");
+            "AssignmentManager has been stopped, can not process assignment any more");
           }
-          LOG.info("{} found a region {} which is no longer on us {}, give up assigning...", this,
-            regionNode, serverName);
+          LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...",
+            this, regionNode, serverName);
           continue;
         }
         if (regionNode.getProcedure() != null) {
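The comment block in this hunk describes a classic check-then-act hazard: the candidate region list is computed before the per-region lock is taken, so the location must be re-validated once the lock is held. Below is a minimal sketch of that pattern; the Node class is a hypothetical, simplified stand-in for HBase's RegionStateNode, and the concurrent mover stands in for a TRSP.

import java.util.List;
import java.util.concurrent.locks.ReentrantLock;

// Hypothetical, simplified region node: a location guarded by a per-node lock.
final class Node {
  final ReentrantLock lock = new ReentrantLock();
  volatile String location; // may be changed concurrently by a TRSP-like actor
  Node(String location) { this.location = location; }
}

public class AssignSketch {
  // Assign every candidate that is still located on the crashed server.
  static void assignRegions(String crashedServer, List<Node> candidates) {
    for (Node node : candidates) {
      node.lock.lock();
      try {
        // The candidate list was computed before the lock was taken; another
        // procedure may have moved the region since. Re-validate under the
        // lock and skip the node when the location no longer matches.
        if (!crashedServer.equals(node.location)) {
          continue; // moved or finished elsewhere; do not assign again
        }
        node.location = null; // stand-in for handing off to a new assign
      } finally {
        node.lock.unlock();
      }
    }
  }
}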
hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp (19 additions & 4 deletions)
@@ -112,8 +112,7 @@
   need to check the server still exists. If not, schedule <em>ServerCrashProcedure</em> for it. If exists,
   restart Server2 and Server1):
   3. More than one regionserver reports opened this region (Fix: restart the RegionServers).
-  Notice: the reported online regionservers may be not right when there are regions in transition.
-  Please check them in regionserver's web UI.
+  Note: the reported online regionservers may not be up-to-date when there are regions in transition.
 </span>
 </p>

@@ -165,8 +164,9 @@
   </div>
   <p>
     <span>
-      The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster.
-      First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may complain);
+      The below are Regions we've lost account of. To be safe, run a bulk load of any data found under these Region orphan directories to have the
+      cluster re-adopt the data.
+      First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may fail);
       run <em>hbck2 fixMeta</em>. Once this is done, per Region below, run a bulk
       load -- <em>$ hbase completebulkload REGION_DIR_PATH TABLE_NAME</em> -- and then delete the desiccated directory content (HFiles are removed upon
       successful load; all that is left are empty directories and occasionally a seqid marking file).
@@ -273,6 +273,21 @@
         <h2>Unknown Servers</h2>
       </div>
     </div>
+    <p>
+      <span>The below are servers mentioned in the hbase:meta table that are no longer 'live' or known 'dead'.
+      The server likely belongs to an older cluster epoch, since replaced by a new instance after a restart/crash.
+      To clear 'Unknown Servers', run 'hbck2 scheduleRecoveries UNKNOWN_SERVERNAME'. This will schedule a ServerCrashProcedure.
+      It will clear out 'Unknown Server' references and schedule reassigns of any Regions that were associated with this host.
+      But first! Be sure the referenced Region is not currently stuck looping trying to OPEN. Does it show as a Region-In-Transition on the
+      Master home page? Is it mentioned in the 'Procedures and Locks' Procedures list? If so, perhaps it is stuck in a loop
+      trying to OPEN but unable to because of a missing reference or file.
+      Read the Master log looking for the most recent mentions of the associated Region name and try to address any
+      such complaint first. If successful, a side-effect should be the cleanup of the 'Unknown Servers' list.
+      It may take a while: OPENs are retried forever, but the interval between retries grows. The 'Unknown Server'
+      reference may also remain simply because it is the last RegionServer the Region was successfully opened on;
+      on the next successful open, it will be purged.
+    </span>
+  </p>
   <table class="table table-striped">
     <tr>
       <th>RegionInfo</th>
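Operators normally run 'hbck2 scheduleRecoveries' for the fix described in the 'Unknown Servers' guidance above, but the same recovery can be scheduled programmatically. Below is a sketch under the assumption that the cluster runs an HBase 2.x release exposing the Hbck client interface (Connection#getHbck and Hbck#scheduleServerCrashProcedures, which is what hbck2's scheduleRecoveries delegates to); the server name string is a placeholder copied from the 'Unknown Servers' table.

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Hbck;

public class ScheduleRecoveryExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    try (Connection connection = ConnectionFactory.createConnection(conf);
         Hbck hbck = connection.getHbck()) {
      // Placeholder 'Unknown Server' name in hostname,port,startcode form,
      // copied from the HBCK Report's Unknown Servers table.
      ServerName unknown = ServerName.valueOf("example-host.example.com,16020,1603000000000");
      // Schedules a ServerCrashProcedure per server and returns the pids.
      List<Long> pids = hbck.scheduleServerCrashProcedures(Arrays.asList(unknown));
      System.out.println("Scheduled SCP pids: " + pids);
    }
  }
}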