HBASE-23315 Miscellaneous HBCK Report page cleanup
 * Add a bit of javadoc around SerialReplicationChecker.
 * Minuscule edit to the profiler jsp page, plus a bit of doc on how to make it work that might help.
 * Add some detail if an AIOOBE is thrown getting BitSetNode, to help w/ debug.
 * Change HbckChore to log region names instead of encoded names; helps with diagnostics; you can take a region name and query it in the shell to find out all about the region according to hbase:meta (see the Java sketch below).
 * Add some fix-it help inline in the HBCK Report page describing how to fix the reported issues.
 * Add counts to the procedures page so you can see whether progress is being made; move the listing of WALs to the end of the page.
saintstack committed Nov 19, 2019
1 parent c6ad71e commit 70771b6
Showing 9 changed files with 110 additions and 67 deletions.
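
A minimal, hypothetical Java sketch (not part of this patch) of the region lookup the commit message describes: the full region name is the hbase:meta row key, so the shell form is roughly `get 'hbase:meta', '<region name>'`, and the Java-client equivalent below dumps every column of the region's meta row. The class name and the sample region name in the comment are invented for illustration.

```java
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public final class MetaRegionLookup {
  public static void main(String[] args) throws Exception {
    // Full region name as logged by HbckChore after this change, e.g.
    // "t1,,1574182400000.0123456789abcdef0123456789abcdef." (example value only).
    String regionName = args[0];
    try (Connection conn = ConnectionFactory.createConnection();
         Table meta = conn.getTable(TableName.META_TABLE_NAME)) {
      Result row = meta.get(new Get(Bytes.toBytes(regionName)));
      if (row.isEmpty()) {
        System.out.println("No hbase:meta row found for " + regionName);
        return;
      }
      // Print every family:qualifier stored for this region (location, state, barriers, ...).
      row.listCells().forEach(cell -> System.out.println(
          Bytes.toString(CellUtil.cloneFamily(cell)) + ":"
              + Bytes.toString(CellUtil.cloneQualifier(cell))));
    }
  }
}
```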
@@ -137,7 +137,7 @@
* columns: info:merge0001, info:merge0002. You may also see 'mergeA'
* and 'mergeB'. This is the old form, replaced by the new format that allows
* for more than two parents to be merged at a time.
* TODO: Add rep_barrier for serial replication explanation.
* TODO: Add rep_barrier for serial replication explanation. See SerialReplicationChecker.
* </pre>
* </p>
* <p>
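
As an aside on the merge columns described in this class comment, a small sketch (not from the patch) that lists the info:mergeNNNN qualifiers from an already-fetched hbase:meta Result; the helper class is an assumption, and only the 'info' family and 'merge' prefix come from the comment above.

```java
import java.util.Map;
import java.util.NavigableMap;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

final class MergeQualifiers {
  private static final byte[] INFO = Bytes.toBytes("info");

  // Print qualifiers such as info:merge0001, info:merge0002 (or legacy mergeA/mergeB)
  // from a region's hbase:meta row.
  static void printMergeParents(Result metaRow) {
    NavigableMap<byte[], byte[]> infoFamily = metaRow.getFamilyMap(INFO);
    if (infoFamily == null) {
      return; // no info family in this row
    }
    for (Map.Entry<byte[], byte[]> e : infoFamily.entrySet()) {
      String qualifier = Bytes.toString(e.getKey());
      if (qualifier.startsWith("merge")) {
        System.out.println("info:" + qualifier);
      }
    }
  }
}
```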
@@ -608,6 +608,7 @@ private static Scan getMetaScan(Connection connection, int rowUpperLimit) {
* @param excludeOfflinedSplitParents don't return split parents
* @return Return list of regioninfos and server addresses.
*/
// What happens here when there are 1M regions in hbase:meta? This won't scale?
public static List<Pair<RegionInfo, ServerName>> getTableRegionsAndLocations(
Connection connection, @Nullable final TableName tableName,
final boolean excludeOfflinedSplitParents) throws IOException {
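
For what it's worth, an illustrative caller built only from the signature shown in this hunk (the patch adds no such code); it lists each region with the server hbase:meta associates with it, which is roughly what the scaling comment above is worried about at 1M regions.

```java
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.util.Pair;

final class ListTableRegions {
  // Dump every region of the table with the server hbase:meta says hosts it.
  static void dump(Connection connection, TableName tableName) throws IOException {
    List<Pair<RegionInfo, ServerName>> regions =
        MetaTableAccessor.getTableRegionsAndLocations(connection, tableName,
            true /* excludeOfflinedSplitParents */);
    for (Pair<RegionInfo, ServerName> p : regions) {
      System.out.println(p.getFirst().getRegionNameAsString() + " -> " + p.getSecond());
    }
  }
}
```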
@@ -1988,6 +1989,9 @@ public static Put makePutForReplicationBarrier(RegionInfo regionInfo, long openS
return put;
}

/**
* See the class comment on SerialReplicationChecker.
*/
public static void addReplicationBarrier(Put put, long openSeqNum) throws IOException {
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
.setRow(put.getRow())
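
The hunk above is cut off, so here is a hedged, self-contained sketch of the CellBuilderFactory pattern it uses: build a Cell and append it to a Put. The family and qualifier names below are placeholders, not necessarily what addReplicationBarrier actually writes.

```java
import java.io.IOException;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderFactory;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

final class CellBuilderSketch {
  // Append one cell to an existing Put using the shallow-copy cell builder.
  static Put addBarrierLikeCell(Put put, long openSeqNum, long ts) throws IOException {
    Cell cell = CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
        .setRow(put.getRow())
        .setFamily(Bytes.toBytes("rep_barrier"))         // placeholder family name
        .setQualifier(Bytes.toBytes("seqnumDuringOpen")) // placeholder qualifier name
        .setTimestamp(ts)
        .setType(Cell.Type.Put)
        .setValue(Bytes.toBytes(openSeqNum))
        .build();
    return put.add(cell);
  }
}
```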
@@ -270,7 +270,7 @@ protected void doGet(final HttpServletRequest req, final HttpServletResponse res
resp.getWriter().write(
"Started [" + event.getInternalName() +
"] profiling. This page will automatically redirect to " +
relativeUrl + " after " + duration + " seconds.\n\ncommand:\n" +
relativeUrl + " after " + duration + " seconds.\n\nCommand:\n" +
Joiner.on(" ").join(cmd));

// to avoid auto-refresh by ProfileOutputServlet, refreshDelay can be specified
@@ -395,4 +395,4 @@ protected void doGet(final HttpServletRequest req, final HttpServletResponse res

}

}
}
@@ -407,7 +407,15 @@ void updateState(long procId, boolean isDeleted) {
int wordIndex = bitmapIndex >> ADDRESS_BITS_PER_WORD;
long value = (1L << bitmapIndex);

modified[wordIndex] |= value;
try {
modified[wordIndex] |= value;
} catch (ArrayIndexOutOfBoundsException aioobe) {
// We've gotten an AIOOBE in here; add detail to help debug.
ArrayIndexOutOfBoundsException aioobe2 =
new ArrayIndexOutOfBoundsException("pid=" + procId + ", deleted=" + isDeleted);
aioobe2.initCause(aioobe);
throw aioobe2;
}
if (isDeleted) {
deleted[wordIndex] |= value;
} else {
@@ -431,4 +439,4 @@ private static long alignUp(final long x) {
private static long alignDown(final long x) {
return x & -BITS_PER_WORD;
}
}
}
@@ -190,10 +190,10 @@ private void loadRegionsFromInMemoryState() {
RegionInfo regionInfo = regionState.getRegion();
if (master.getTableStateManager()
.isTableState(regionInfo.getTable(), TableState.State.DISABLED)) {
disabledTableRegions.add(regionInfo.getEncodedName());
disabledTableRegions.add(regionInfo.getRegionNameAsString());
}
if (regionInfo.isSplitParent()) {
splitParentRegions.add(regionInfo.getEncodedName());
splitParentRegions.add(regionInfo.getRegionNameAsString());
}
HbckRegionInfo.MetaEntry metaEntry =
new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
@@ -212,7 +212,7 @@ private void loadRegionsFromRSReport() {
String encodedRegionName = RegionInfo.encodeRegionName(regionName);
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
if (hri == null) {
orphanRegionsOnRS.put(encodedRegionName, serverName);
orphanRegionsOnRS.put(RegionInfo.getRegionNameAsString(regionName), serverName);
continue;
}
hri.addServer(hri.getMetaEntry(), serverName);
@@ -223,29 +223,31 @@ private void loadRegionsFromRSReport() {
numRegions, rsReports.size(), orphanRegionsOnFS.size());

for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) {
String encodedRegionName = entry.getKey();
HbckRegionInfo hri = entry.getValue();
ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
if (hri.getDeployedOn().size() == 0) {
if (locationInMeta == null) {
continue;
}
// skip offline regions which belong to a disabled table.
if (disabledTableRegions.contains(encodedRegionName)) {
if (disabledTableRegions.contains(hri.getRegionNameAsString())) {
continue;
}
// skip the split parent regions
if (splitParentRegions.contains(encodedRegionName)) {
if (splitParentRegions.contains(hri.getRegionNameAsString())) {
continue;
}
// Master thought this region opened, but no regionserver reported it.
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>()));
inconsistentRegions.put(hri.getRegionNameAsString(),
new Pair<>(locationInMeta, new LinkedList<>()));
} else if (hri.getDeployedOn().size() > 1) {
// More than one regionserver reported this region as opened
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
inconsistentRegions.put(hri.getRegionNameAsString(),
new Pair<>(locationInMeta, hri.getDeployedOn()));
} else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
// Master thought this region opened on Server1, but regionserver reported Server2
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
inconsistentRegions.put(hri.getRegionNameAsString(),
new Pair<>(locationInMeta, hri.getDeployedOn()));
}
}
}
@@ -339,4 +341,4 @@ public long getCheckingStartTimestamp() {
public long getCheckingEndTimestamp() {
return this.checkingEndTimestamp;
}
}
}
@@ -50,12 +50,13 @@
* </p>
* <p>
* We record all the open sequence numbers for a region in a special family in meta, which is called
* 'barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call [bn,
* bn+1) a range, and it is obvious that a region will always be on the same RS within a range.
* 'rep_barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call
* [bn, bn+1) a range, and it is obvious that a region will always be on the same RS within a
* range.
* <p>
* When a region splits or merges, we will also record the parent for the generated region(s) in the special
* family in meta. And also, we will write an extra 'open sequence number' for the parent region(s),
* which is the max sequence id of the region plus one.
* family in meta. And also, we will write an extra 'open sequence number' for the parent
* region(s), which is the max sequence id of the region plus one.
* </p>
* </p>
* <p>
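
To make the [bn, bn+1) notation in this class comment concrete, here is a small, hypothetical helper (not the checker's real code) that locates which barrier range a given sequence id falls into, assuming the recorded open sequence numbers are sorted ascending.

```java
import java.util.Arrays;

final class BarrierRanges {
  /**
   * Returns the index i such that seqId lies in [barriers[i], barriers[i + 1]),
   * or -1 if seqId is smaller than the first recorded open sequence number.
   * The barriers array must be sorted in ascending order.
   */
  static int rangeIndex(long[] barriers, long seqId) {
    int pos = Arrays.binarySearch(barriers, seqId);
    if (pos >= 0) {
      return pos; // seqId is exactly an open sequence number: start of that range
    }
    int insertion = -pos - 1; // index of the first barrier greater than seqId
    return insertion - 1;     // -1 means seqId predates every recorded barrier
  }
}
```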
@@ -277,13 +277,15 @@ public static class RegionStdOutSink extends StdOutSink {

public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
incReadFailureCount();
LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
LOG.error("Read from {} on serverName={} failed",
region.getRegionNameAsString(), serverName, e);
}

public void publishReadFailure(ServerName serverName, RegionInfo region,
ColumnFamilyDescriptor column, Exception e) {
incReadFailureCount();
LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
LOG.error("Read from {} on serverName={}, columnFamily={} failed",
region.getRegionNameAsString(), serverName,
column.getNameAsString(), e);
}

14 changes: 11 additions & 3 deletions hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
@@ -78,7 +78,7 @@

<div class="row">
<div class="page-header">
<p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are transitory as regions migrate.</span></p>
<p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are <em>transitory</em> as regions migrate.</span></p>
</div>
</div>
<div class="row">
@@ -119,7 +119,7 @@

<table class="table table-striped">
<tr>
<th>Region Encoded Name</th>
<th>Region Name</th>
<th>Location in META</th>
<th>Reported Online RegionServers</th>
</tr>
@@ -142,10 +142,18 @@
<h2>Orphan Regions on RegionServer</h2>
</div>
</div>
<p>
<span>
The Regions below are ones we've lost account of. To be safe, run a bulk load of any data found in these orphan Region directories back into the HBase cluster.
First make sure hbase:meta is in a healthy state; run 'hbck2 fixMeta' to be sure. Once this is done, per Region below, run a bulk
load -- '$ hbase completebulkload REGION_DIR_PATH TABLE_NAME' -- and then delete the desiccated directory content (HFiles are removed upon successful load; all that is left are empty directories
and occasionally a seqid marking file).
</span>
</p>

<table class="table table-striped">
<tr>
<th>Region Encoded Name</th>
<th>Region Name</th>
<th>Reported Online RegionServer</th>
</tr>
<% for (Map.Entry<String, ServerName> entry : orphanRegionsOnRS.entrySet()) { %>
98 changes: 58 additions & 40 deletions hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp
@@ -81,11 +81,14 @@
<th>Errors</th>
<th>Parameters</th>
</tr>
<% for (Procedure<?> proc : procedures) {
<%
int displayCount = 0;
for (Procedure<?> proc : procedures) {
// Don't show SUCCESS procedures.
if (proc.isSuccess()) {
continue;
}
displayCount++;
%>
<tr>
<td><%= proc.getProcId() %></td>
@@ -99,9 +102,63 @@
<td><%= escapeXml(proc.toString()) %></td>
</tr>
<% } %>
</table>
<%
  if (displayCount > 0) {
%>
<p><%= displayCount %> procedure(s).</p>
<%
  }
%>
</div>
<br />
<div class="container-fluid content">
<div class="row">
<div class="page-header">
<h1>Locks</h1>
</div>
</div>
<%
if (lockedResources.size() > 0) {
%>
<p><%= lockedResources.size() %> lock(s).</p>
<%
}
%>
<% for (LockedResource lockedResource : lockedResources) { %>
<h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
<%
switch (lockedResource.getLockType()) {
case EXCLUSIVE:
%>
<p>Lock type: EXCLUSIVE</p>
<p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
<%
break;
case SHARED:
%>
<p>Lock type: SHARED</p>
<p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
<%
break;
}
List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
if (!waitingProcedures.isEmpty()) {
%>
<h3>Waiting procedures</h3>
<table class="table table-striped" width="90%" >
<% for (Procedure<?> proc : waitingProcedures) { %>
<tr>
<td><%= escapeXml(proc.toStringDetails()) %></td>
</tr>
<% } %>
</table>
<% } %>
<% } %>
</div>
<br />
<div class="container-fluid content">
<div class="row">
<div class="page-header">
@@ -206,44 +263,5 @@
</div>
</div>
<br />
<div class="container-fluid content">
<div class="row">
<div class="page-header">
<h1>Locks</h1>
</div>
</div>
<% for (LockedResource lockedResource : lockedResources) { %>
<h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
<%
switch (lockedResource.getLockType()) {
case EXCLUSIVE:
%>
<p>Lock type: EXCLUSIVE</p>
<p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
<%
break;
case SHARED:
%>
<p>Lock type: SHARED</p>
<p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
<%
break;
}
List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
if (!waitingProcedures.isEmpty()) {
%>
<h3>Waiting procedures</h3>
<table class="table table-striped" width="90%" >
<% for (Procedure<?> proc : procedures) { %>
<tr>
<td><%= escapeXml(proc.toStringDetails()) %></td>
</tr>
<% } %>
</table>
<% } %>
<% } %>
</div>

<jsp:include page="footer.jsp" />
@@ -69,7 +69,7 @@ public void setUp() throws Exception {
@Test
public void testForMeta() {
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getEncodedName();
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
assertEquals(NSERVERS, serverNames.size());

@@ -96,7 +96,7 @@ public void testForMeta() {
public void testForUserTable() throws Exception {
TableName tableName = TableName.valueOf("testForUserTable");
RegionInfo hri = createRegionInfo(tableName, 1);
String regionName = hri.getEncodedName();
String regionName = hri.getRegionNameAsString();
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
waitOnFuture(future);
@@ -154,7 +154,7 @@ public void testForUserTable() throws Exception {
public void testForDisabledTable() throws Exception {
TableName tableName = TableName.valueOf("testForDisabledTable");
RegionInfo hri = createRegionInfo(tableName, 1);
String regionName = hri.getEncodedName();
String regionName = hri.getRegionNameAsString();
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
waitOnFuture(future);
