Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-23596 HBCKServerCrashProcedure can double assign #952

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1448,7 +1448,7 @@ public static void removeRegionReplicasFromMeta(Set<byte[]> metaRows,
}
}

private static void addRegionStateToPut(Put put, RegionState.State state) throws IOException {
private static Put addRegionStateToPut(Put put, RegionState.State state) throws IOException {
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
.setRow(put.getRow())
.setFamily(HConstants.CATALOG_FAMILY)
Expand All @@ -1457,6 +1457,17 @@ private static void addRegionStateToPut(Put put, RegionState.State state) throws
.setType(Cell.Type.Put)
.setValue(Bytes.toBytes(state.name()))
.build());
return put;
}

/**
* Update state column in hbase:meta.
*/
public static void updateRegionState(Connection connection, RegionInfo ri,
RegionState.State state) throws IOException {
Put put = new Put(RegionReplicaUtil.getRegionInfoForDefaultReplica(ri).getRegionName());
MetaTableAccessor.putsToMetaTable(connection,
Collections.singletonList(addRegionStateToPut(put, state)));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ private void visitMetaEntry(final RegionStateVisitor visitor, final Result resul
if (regionInfo == null) continue;

final int replicaId = regionInfo.getReplicaId();
final State state = getRegionState(result, replicaId, regionInfo);
final State state = getRegionState(result, regionInfo);

final ServerName lastHost = hrl.getServerName();
final ServerName regionLocation = getRegionServer(result, replicaId);
Expand Down Expand Up @@ -326,12 +326,11 @@ private static byte[] getServerNameColumn(int replicaId) {

/**
* Pull the region state from a catalog table {@link Result}.
* @param r Result to pull the region state from
* @return the region state, or null if unknown.
*/
@VisibleForTesting
public static State getRegionState(final Result r, int replicaId, RegionInfo regionInfo) {
Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId));
public static State getRegionState(final Result r, RegionInfo regionInfo) {
Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY,
getStateColumn(regionInfo.getReplicaId()));
if (cell == null || cell.getValueLength() == 0) {
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,31 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.RegionLocations;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* A SCP that differs from default only in how it gets the list of
* Regions hosted on the crashed-server; it also reads hbase:meta directly rather
* than rely solely on Master memory for list of Regions that were on crashed server.
* This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
* scheduleRecoveries). It is for the case where meta has references to 'Unknown Servers',
* Acts like the super class in all cases except when no Regions found in the
* current Master in-memory context. In this latter case, when the call to
* super#getRegionsOnCrashedServer returns nothing, this SCP will scan
* hbase:meta for references to the passed ServerName. If any found, we'll
* clean them up.
*
* <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
* scheduleRecoveries); the super class is used during normal recovery operations.
* It is for the case where meta has references to 'Unknown Servers',
* servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master
* and hbase:meta content have deviated. It should never happen in normal running
* cluster but if we do drop accounting of servers, we need a means of fix-up.
Expand Down Expand Up @@ -65,31 +75,97 @@ public HBCKServerCrashProcedure(final MasterProcedureEnv env, final ServerName s
public HBCKServerCrashProcedure() {}

/**
* Adds Regions found by super method any found scanning hbase:meta.
* If no Regions found in Master context, then we will search hbase:meta for references
* to the passed server. Operator may have passed ServerName because they have found
* references to 'Unknown Servers'. They are using HBCKSCP to clear them out.
*/
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
justification="FindBugs seems confused on ps in below.")
List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
// Super can return immutable emptyList.
// Super will return an immutable list (empty if nothing on this server).
List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
List<Pair<RegionInfo, ServerName>> ps = null;
if (!ris.isEmpty()) {
return ris;
}
// Nothing in in-master context. Check for Unknown Server! in hbase:meta.
// If super list is empty, then allow that an operator scheduled an SCP because they are trying
// to purge 'Unknown Servers' -- servers that are neither online nor in dead servers
// list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'.
// This mis-accounting does not happen in normal circumstance but may arise in-extremis
// when cluster has been damaged in operation.
UnknownServerVisitor visitor =
new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName());
try {
ps = MetaTableAccessor.getTableRegionsAndLocations(env.getMasterServices().getConnection(),
null, false);
MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(),
visitor, null);
} catch (IOException ioe) {
LOG.warn("Failed get of all regions; continuing", ioe);
}
if (ps == null || ps.isEmpty()) {
LOG.warn("No regions found in hbase:meta");
LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe);
return ris;
}
List<RegionInfo> aggregate = ris == null || ris.isEmpty()?
new ArrayList<>(): new ArrayList<>(ris);
int before = aggregate.size();
ps.stream().filter(p -> p.getSecond() != null && p.getSecond().equals(getServerName())).
forEach(p -> aggregate.add(p.getFirst()));
LOG.info("Found {} mentions of {} in hbase:meta", aggregate.size() - before, getServerName());
return aggregate;
LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}",
visitor.getReassigns().size(), getServerName(),
visitor.getReassigns().stream().map(RegionInfo::getEncodedName).
collect(Collectors.joining(",")));
return visitor.getReassigns();
}

/**
* Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects
* a List of Regions to reassign as 'result'.
*/
private static class UnknownServerVisitor implements MetaTableAccessor.Visitor {
private final List<RegionInfo> reassigns = new ArrayList<>();
private final ServerName unknownServerName;
private final Connection connection;

private UnknownServerVisitor(Connection connection, ServerName unknownServerName) {
this.connection = connection;
this.unknownServerName = unknownServerName;
}

@Override
public boolean visit(Result result) throws IOException {
RegionLocations rls = MetaTableAccessor.getRegionLocations(result);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rls would never be null in our case right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't be but let me handle it; when this code is running the hbase:meta could be messed up.

if (rls == null) {
return true;
}
for (HRegionLocation hrl: rls.getRegionLocations()) {
if (hrl == null) {
continue;
}
if (hrl.getRegion() == null) {
continue;
}
if (hrl.getServerName() == null) {
continue;
}
if (!hrl.getServerName().equals(this.unknownServerName)) {
continue;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Above 3 conditions can be combined for continue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As is it is an easier read IMO!

RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion());
RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName());
if (rs.isClosing()) {
// Move region to CLOSED in hbase:meta.
LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta",
hrl.getRegion().getRegionNameAsString());
try {
MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(),
RegionState.State.CLOSED);
} catch (IOException ioe) {
LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe);
}
} else if (rs.isOpening() || rs.isOpened()) {
this.reassigns.add(hrl.getRegion());
} else {
LOG.info("Passing {}", rs);
}
}
return true;
}

private List<RegionInfo> getReassigns() {
return this.reassigns;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3651,8 +3651,7 @@ public boolean evaluate() throws IOException {
return false;
}
}
if (RegionStateStore.getRegionState(r,
info.getReplicaId(), info) != RegionState.State.OPEN) {
if (RegionStateStore.getRegionState(r, info) != RegionState.State.OPEN) {
return false;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ public void test() throws Exception {
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
master.getServerManager().getDeadServers().finish(rsServerName);
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
// Kill the server. Nothing should happen since an 'Unknown Server' as far
// as the Master is concerned; i.e. no SCP.
LOG.info("Killing {}", rsServerName);
Expand Down