Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-22527 [hbck2] Add a master web ui to show the problematic regions #373

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,98 @@ See the License for the specific language governing permissions and
limitations under the License.
</%doc>
<%import>
org.apache.hadoop.hbase.master.assignment.AssignmentManager;
org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
org.apache.hadoop.hbase.master.RegionState;
java.util.Map;
java.util.Set;
java.util.SortedSet;
java.util.concurrent.atomic.AtomicInteger;
java.util.stream.Collectors;
org.apache.hadoop.conf.Configuration;
org.apache.hadoop.hbase.HBaseConfiguration;
org.apache.hadoop.hbase.HConstants;
org.apache.hadoop.hbase.ServerName;
org.apache.hadoop.hbase.client.RegionInfo;
org.apache.hadoop.hbase.client.RegionInfoDisplay;
java.util.HashSet;
java.util.SortedSet;
java.util.Map;
java.util.concurrent.atomic.AtomicInteger;
org.apache.hadoop.hbase.master.RegionState;
org.apache.hadoop.hbase.master.assignment.AssignmentManager;
org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
org.apache.hadoop.hbase.util.Pair;
</%import>
<%args>
AssignmentManager assignmentManager;
int limit = 100;
</%args>

<%java SortedSet<RegionState> rit = assignmentManager
.getRegionStates().getRegionsInTransitionOrderedByTimestamp();
%>
<%java>
SortedSet<RegionState> rit = assignmentManager.getRegionStates()
.getRegionsInTransitionOrderedByTimestamp();
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = assignmentManager
.getProblematicRegions();
</%java>

<%if !problematicRegions.isEmpty() %>
<%java>
int totalSize = problematicRegions.size();
int sizePerPage = Math.min(10, totalSize);
int numOfPages = (int) Math.ceil(totalSize * 1.0 / sizePerPage);
</%java>
<section>
<h2><a name="rit">Problematic Regions</a></h2>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On commit, add a sentence that says what a problematic region is. It seems like it's one that has a meta entry that does not agree w/ where it is actually deployed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or, you say below what it is:

  • case 1. Master thought this region opened, but no regionserver reported it.
  • case 2. Master thought this region opened on Server1, but regionserver reported Server2
  • case 3. More than one regionservers reported opened this region

Can this be added in the UI maybe in small text?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already committed it. Will add an addendum for this.

<div class="tabbable">
<div class="tab-content">
<%java int recordItr = 0; %>
<%for Map.Entry<String, Pair<ServerName, Set<ServerName>>> entry : problematicRegions.entrySet() %>
<%if (recordItr % sizePerPage) == 0 %>
<%if recordItr == 0 %>
<div class="tab-pane active" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
<%else>
<div class="tab-pane" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
</%if>
<table class="table table-striped" style="margin-bottom:0px;">
<tr>
<th>Region</th>
<th>Location in META</th>
<th>Reported Online Region Servers</th>
</tr>
</%if>

<tr>
<td><% entry.getKey() %></td>
<td><% entry.getValue().getFirst() %></td>
<td><% entry.getValue().getSecond().stream().map(ServerName::getServerName)
.collect(Collectors.joining(", ")) %></td>
</tr>
<%java recordItr++; %>
<%if (recordItr % sizePerPage) == 0 %>
</table>
</div>
</%if>
</%for>

<%if (recordItr % sizePerPage) != 0 %>
<%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %>
<tr><td colspan="3" style="height:61px"></td></tr>
</%for>
</table>
</div>
</%if>

</div>
<nav>
<ul class="nav nav-pills pagination">
<%for int i = 1 ; i <= numOfPages; i++ %>
<%if i == 1 %>
<li class="active">
<%else>
<li>
</%if>
<a href="#tab_prs<% i %>"><% i %></a></li>
</%for>
</ul>
</nav>
</div>
</section>
</%if>

<%if !rit.isEmpty() %>
<%java>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ public class AssignmentManager {
private final RegionStates regionStates = new RegionStates();
private final RegionStateStore regionStateStore;

private final Map<ServerName, Set<byte[]>> rsReports = new HashMap<>();

private final boolean shouldAssignRegionsWithFavoredNodes;
private final int assignDispatchWaitQueueMaxSize;
private final int assignDispatchWaitMillis;
Expand Down Expand Up @@ -1065,13 +1067,18 @@ public void reportOnlineRegions(ServerName serverName, Set<byte[]> regionNames)
}

ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);

synchronized (serverNode) {
if (!serverNode.isInState(ServerState.ONLINE)) {
LOG.warn("Got a report from a server result in state " + serverNode.getState());
return;
}
}

// Track the regionserver reported online regions in memory.
synchronized (rsReports) {
rsReports.put(serverName, regionNames);
}

if (regionNames.isEmpty()) {
// nothing to do if we don't have regions
LOG.trace("no online region found on {}", serverName);
Expand Down Expand Up @@ -2028,4 +2035,53 @@ public List<ServerName> getExcludedServersForSystemTable() {
/**
 * Package-private accessor for the {@code master} reference held by this instance.
 *
 * @return the {@code master} field, as is
 */
MasterServices getMaster() {
return master;
}

/**
 * Finds the potentially problematic opened regions. There are three cases:
 * case 1. Master thought this region opened, but no regionserver reported it.
 * case 2. Master thought this region opened on Server1, but a regionserver reported Server2.
 * case 3. More than one regionserver reported this region as opened.
 * Regions that are in transition, or that are not in the opened state, are never reported.
 *
 * @return the map of potentially problematic opened regions. The key is the region name. The
 *         value is a pair of the location in meta and the regionservers which reported this
 *         region as opened (an empty set when no regionserver reported it).
 */
public Map<String, Pair<ServerName, Set<ServerName>>> getProblematicRegions() {
  // Invert the per-server reports into: region name -> servers that reported it online.
  // rsReports is only touched while holding its monitor, so copy out under the same lock.
  Map<String, Set<ServerName>> reportedOnlineRegions = new HashMap<>();
  synchronized (rsReports) {
    for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
      for (byte[] regionName : entry.getValue()) {
        reportedOnlineRegions
          .computeIfAbsent(RegionInfo.getRegionNameAsString(regionName), r -> new HashSet<>())
          .add(entry.getKey());
      }
    }
  }

  Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = new HashMap<>();
  List<RegionState> rits = regionStates.getRegionsStateInTransition();
  for (RegionState regionState : regionStates.getRegionStates()) {
    // Only consider opened regions that are not in transition. The cheap state check runs
    // first so the linear scan of the RIT list is skipped for regions that are not opened.
    if (!regionState.isOpened() || rits.contains(regionState)) {
      continue;
    }

    String regionName = regionState.getRegion().getRegionNameAsString();
    ServerName serverName = regionState.getServerName();
    // Values in reportedOnlineRegions are never null, so a null here means "not reported".
    Set<ServerName> reportedServers = reportedOnlineRegions.get(regionName);
    if (reportedServers == null) {
      // case 1: Master thought this region opened, but no regionserver reported it.
      problematicRegions.put(regionName, new Pair<>(serverName, new HashSet<>()));
    } else if (!reportedServers.contains(serverName) || reportedServers.size() > 1) {
      // case 2: Master thought this region opened on Server1, but regionserver reported Server2
      // case 3: More than one regionserver reported opened this region
      problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
    }
  }

  return problematicRegions;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.assignment;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Future;

import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests {@code AssignmentManager#getProblematicRegions()}: regions the master considers opened
 * whose location disagrees with what the regionservers reported through
 * {@code AssignmentManager#reportOnlineRegions}.
 */
@Category({ MasterTests.class, MediumTests.class })
public class TestAMProblematicRegions extends TestAssignmentManagerBase {
  private static final Logger LOG = LoggerFactory.getLogger(TestAMProblematicRegions.class);

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestAMProblematicRegions.class);

  /**
   * The meta region is problematic until the server recorded as its location reports it online.
   */
  @Test
  public void testForMeta() {
    byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
    String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
    List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
    assertEquals(NSERVERS, serverNames.size());

    Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();

    // Test for case1: Master thought this region opened, but no regionserver reported it.
    assertTrue(problematicRegions.containsKey(metaRegionName));
    Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(metaRegionName);
    ServerName locationInMeta = pair.getFirst();
    Set<ServerName> reportedRegionServers = pair.getSecond();
    assertTrue(serverNames.contains(locationInMeta));
    assertEquals(0, reportedRegionServers.size());

    // Reported right region location. Then not in problematic regions.
    am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
    problematicRegions = am.getProblematicRegions();
    assertFalse(problematicRegions.containsKey(metaRegionName));
  }

  /**
   * Walks a freshly assigned user region through all three problematic cases and then back to a
   * consistent state.
   */
  @Test
  public void testForUserTable() throws Exception {
    TableName tableName = TableName.valueOf("testForUserTable");
    RegionInfo hri = createRegionInfo(tableName, 1);
    String regionName = hri.getRegionNameAsString();
    rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
    Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
    waitOnFuture(future);

    List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
    assertEquals(NSERVERS, serverNames.size());

    // Test for case1: Master thought this region opened, but no regionserver reported it.
    Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
    assertTrue(problematicRegions.containsKey(regionName));
    Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(regionName);
    ServerName locationInMeta = pair.getFirst();
    Set<ServerName> reportedRegionServers = pair.getSecond();
    assertTrue(serverNames.contains(locationInMeta));
    assertEquals(0, reportedRegionServers.size());

    // Test for case2: Master thought this region opened on Server1, but regionserver reported
    // Server2
    // locationInMeta is reassigned below, so the lambda needs an effectively final copy.
    final ServerName tempLocationInMeta = locationInMeta;
    final ServerName anotherServer =
      serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get();
    am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
    problematicRegions = am.getProblematicRegions();
    assertTrue(problematicRegions.containsKey(regionName));
    pair = problematicRegions.get(regionName);
    locationInMeta = pair.getFirst();
    reportedRegionServers = pair.getSecond();
    assertEquals(1, reportedRegionServers.size());
    assertFalse(reportedRegionServers.contains(locationInMeta));
    assertTrue(reportedRegionServers.contains(anotherServer));

    // Test for case3: More than one regionservers reported opened this region.
    am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
    problematicRegions = am.getProblematicRegions();
    assertTrue(problematicRegions.containsKey(regionName));
    pair = problematicRegions.get(regionName);
    locationInMeta = pair.getFirst();
    reportedRegionServers = pair.getSecond();
    assertEquals(2, reportedRegionServers.size());
    assertTrue(reportedRegionServers.contains(locationInMeta));
    assertTrue(reportedRegionServers.contains(anotherServer));

    // The other server now reports no online regions, leaving only the location in meta
    // reporting this region. Then not in problematic regions.
    // Collections.emptySet() is the type-safe form of the raw Collections.EMPTY_SET constant.
    am.reportOnlineRegions(anotherServer, Collections.emptySet());
    problematicRegions = am.getProblematicRegions();
    assertFalse(problematicRegions.containsKey(regionName));
  }
}