HBASE-28241 The snapshot operation encountered an NPE and failed. (#5560)

Fixed the check for an ongoing snapshot before proceeding with a merge/split region operation: the merge and split procedures now call SnapshotManager.isTableTakingAnySnapshot() instead of isTakingSnapshot(), so the operation is rejected while any snapshot of the table is in flight.

Co-authored-by: lvhaiping.lhp <lvhaiping.lhp@alibaba-inc.com>
Signed-off-by: Duo Zhang <zhangduo@apache.org>
Signed-off-by: Hui Ruan <huiruan@apache.org>
(cherry picked from commit 3d11712)
hiping-tech authored and Apache9 committed Dec 12, 2023
1 parent dd6fd73 commit 1d7ea88
Showing 5 changed files with 124 additions and 2 deletions.
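
The substance of the fix is which guard method on the snapshot manager the merge/split prepare steps consult. Below is a minimal sketch of that distinction; the field names and method bodies are hypothetical, not the actual org.apache.hadoop.hbase.master.snapshot.SnapshotManager code. The only assumption taken from this commit is that procedure-driven snapshots are registered with the manager, as the tests below do via registerSnapshotProcedure.

// Hypothetical sketch -- names and bodies are illustrative only.
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

final class SnapshotGuardSketch {
  // snapshots driven by per-table snapshot handlers
  private final Map<String, Object> handlerByTable = new ConcurrentHashMap<>();
  // snapshots driven by master procedures: table name -> in-flight procedure ids
  private final Map<String, Set<Long>> snapshotProcsByTable = new ConcurrentHashMap<>();

  // Narrow check: only sees handler-driven snapshots for the table.
  boolean isTakingSnapshot(String table) {
    return handlerByTable.containsKey(table);
  }

  // Broad check: additionally covers snapshots registered as procedures,
  // which is what the merge/split prepare steps need to consult.
  boolean isTableTakingAnySnapshot(String table) {
    return isTakingSnapshot(table)
      || !snapshotProcsByTable.getOrDefault(table, Set.of()).isEmpty();
  }
}

With only the narrow check, a merge or split submitted while a procedure-driven snapshot was running could slip past the guard and race the snapshot, which then failed with the NPE in the subject; the broader check rejects the region operation up front, which is exactly what the two hunks below change.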
MergeTableRegionsProcedure.java
@@ -443,7 +443,7 @@ protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
   private boolean prepareMergeRegion(final MasterProcedureEnv env) throws IOException {
     // Fail if we are taking snapshot for the given table
     TableName tn = regionsToMerge[0].getTable();
-    if (env.getMasterServices().getSnapshotManager().isTakingSnapshot(tn)) {
+    if (env.getMasterServices().getSnapshotManager().isTableTakingAnySnapshot(tn)) {
       throw new MergeRegionException("Skip merging regions "
         + RegionInfo.getShortNameToLog(regionsToMerge) + ", because we are snapshotting " + tn);
     }
SplitTableRegionProcedure.java
@@ -505,7 +505,8 @@ private byte[] getSplitRow() {
   public boolean prepareSplitRegion(final MasterProcedureEnv env) throws IOException {
     // Fail if we are taking snapshot for the given table
     if (
-      env.getMasterServices().getSnapshotManager().isTakingSnapshot(getParentRegion().getTable())
+      env.getMasterServices().getSnapshotManager()
+        .isTableTakingAnySnapshot(getParentRegion().getTable())
     ) {
       setFailure(new IOException("Skip splitting region " + getParentRegion().getShortNameToLog()
         + ", because we are taking snapshot for the table " + getParentRegion().getTable()));
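
For orientation, here is a client-side sketch of the behavior these two guards enforce. It is not part of the commit; the helper class name, the table handle, and the split key are assumed for illustration. Admin.snapshotAsync, Admin.getRegions, and Admin.splitRegionAsync are standard HBase Admin calls, and whether the caller sees the rejection synchronously or via the returned future depends on timing.

// Illustrative only: while a snapshot of the table is in flight, a concurrent
// split (or merge) request is expected to be rejected by the master's prepare step.
import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.SnapshotDescription;
import org.apache.hadoop.hbase.client.SnapshotType;
import org.apache.hadoop.hbase.util.Bytes;

public final class SplitDuringSnapshotSketch {
  // Kick off a flush snapshot, then try to split the table's first region while
  // the snapshot is still in flight.
  static void splitWhileSnapshotting(Admin admin, TableName table) throws Exception {
    Future<Void> snapshotFuture =
      admin.snapshotAsync(new SnapshotDescription("example-snapshot", table, SnapshotType.FLUSH));

    List<RegionInfo> regions = admin.getRegions(table);
    try {
      admin.splitRegionAsync(regions.get(0).getRegionName(), Bytes.toBytes("row-500")).get();
    } catch (IOException | ExecutionException e) {
      // With this change the split is rejected up front in prepareSplitRegion,
      // so it no longer races the concurrent snapshot (which previously could
      // hit an NPE and fail).
      System.out.println("Split rejected while snapshotting: " + e);
    }

    snapshotFuture.get(); // the snapshot itself is expected to complete normally
  }
}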
TestMergeTableRegionsProcedure.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hbase.master.assignment;

+import static org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility.assertProcFailed;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;

@@ -33,16 +34,20 @@
 import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.SnapshotDescription;
+import org.apache.hadoop.hbase.client.SnapshotType;
 import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.client.TableDescriptor;
 import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
+import org.apache.hadoop.hbase.master.procedure.TestSnapshotProcedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
 import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
 import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
 import org.apache.hadoop.hbase.util.Bytes;
@@ -59,6 +64,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos;
+
 @Category({ MasterTests.class, LargeTests.class })
 public class TestMergeTableRegionsProcedure {

@@ -345,6 +353,42 @@ public void testMergeWithoutPONR() throws Exception {
     assertRegionCount(tableName, initialRegionCount - 1);
   }

+  @Test
+  public void testMergingRegionWhileTakingSnapshot() throws Exception {
+    final TableName tableName = TableName.valueOf("testMergingRegionWhileTakingSnapshot");
+    final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
+
+    List<RegionInfo> tableRegions = createTable(tableName);
+
+    ProcedureTestingUtility.waitNoProcedureRunning(procExec);
+
+    SnapshotDescription snapshot =
+      new SnapshotDescription("SnapshotProcedureTest", tableName, SnapshotType.FLUSH);
+    SnapshotProtos.SnapshotDescription snapshotProto =
+      ProtobufUtil.createHBaseProtosSnapshotDesc(snapshot);
+    snapshotProto = SnapshotDescriptionUtils.validate(snapshotProto,
+      UTIL.getHBaseCluster().getMaster().getConfiguration());
+    long snapshotProcId = procExec.submitProcedure(
+      new TestSnapshotProcedure.DelaySnapshotProcedure(procExec.getEnvironment(), snapshotProto));
+    UTIL.getHBaseCluster().getMaster().getSnapshotManager().registerSnapshotProcedure(snapshotProto,
+      snapshotProcId);
+
+    RegionInfo[] regionsToMerge = new RegionInfo[2];
+    regionsToMerge[0] = tableRegions.get(0);
+    regionsToMerge[1] = tableRegions.get(1);
+
+    long mergeProcId = procExec.submitProcedure(
+      new MergeTableRegionsProcedure(procExec.getEnvironment(), regionsToMerge, true));
+
+    ProcedureTestingUtility
+      .waitProcedure(UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(), mergeProcId);
+    ProcedureTestingUtility.waitProcedure(
+      UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(), snapshotProcId);
+
+    assertProcFailed(procExec, mergeProcId);
+    assertEquals(initialRegionCount, UTIL.getAdmin().getRegions(tableName).size());
+  }
+
   private List<RegionInfo> createTable(final TableName tableName) throws Exception {
     TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
       .setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY)).build();
TestSplitTableRegionProcedure.java
@@ -41,6 +41,8 @@
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.SnapshotDescription;
+import org.apache.hadoop.hbase.client.SnapshotType;
 import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.client.TableDescriptor;
 import org.apache.hadoop.hbase.coprocessor.ObserverContext;
@@ -50,10 +52,12 @@
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
+import org.apache.hadoop.hbase.master.procedure.TestSnapshotProcedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
 import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
 import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
 import org.apache.hadoop.hbase.testclassification.MediumTests;
 import org.apache.hadoop.hbase.util.Bytes;
@@ -69,6 +73,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos;
+
 @Category({ MasterTests.class, MediumTests.class })
 public class TestSplitTableRegionProcedure {

@@ -550,6 +557,52 @@ public void testSplitWithoutPONR() throws Exception {
     verify(tableName, splitRowNum);
   }

+  @Test
+  public void testSplitRegionWhileTakingSnapshot() throws Exception {
+    final TableName tableName = TableName.valueOf(name.getMethodName());
+    final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
+
+    RegionInfo[] regions = MasterProcedureTestingUtility.createTable(procExec, tableName, null,
+      columnFamilyName1, columnFamilyName2);
+    int splitRowNum = startRowNum + rowCount / 2;
+    byte[] splitKey = Bytes.toBytes("" + splitRowNum);
+
+    assertTrue("not able to find a splittable region", regions != null);
+    assertTrue("not able to find a splittable region", regions.length == 1);
+    ProcedureTestingUtility.waitNoProcedureRunning(procExec);
+
+    // task snapshot
+    SnapshotDescription snapshot =
+      new SnapshotDescription("SnapshotProcedureTest", tableName, SnapshotType.FLUSH);
+    SnapshotProtos.SnapshotDescription snapshotProto =
+      ProtobufUtil.createHBaseProtosSnapshotDesc(snapshot);
+    snapshotProto = SnapshotDescriptionUtils.validate(snapshotProto,
+      UTIL.getHBaseCluster().getMaster().getConfiguration());
+    long snapshotProcId = procExec.submitProcedure(
+      new TestSnapshotProcedure.DelaySnapshotProcedure(procExec.getEnvironment(), snapshotProto));
+    UTIL.getHBaseCluster().getMaster().getSnapshotManager().registerSnapshotProcedure(snapshotProto,
+      snapshotProcId);
+
+    // collect AM metrics before test
+    collectAssignmentManagerMetrics();
+
+    // Split region of the table
+    long procId = procExec.submitProcedure(
+      new SplitTableRegionProcedure(procExec.getEnvironment(), regions[0], splitKey));
+    // Wait the completion
+    ProcedureTestingUtility.waitProcedure(procExec, procId);
+    ProcedureTestingUtility.waitProcedure(procExec, snapshotProcId);
+
+    ProcedureTestingUtility.assertProcFailed(procExec, procId);
+    ProcedureTestingUtility.assertProcNotFailed(procExec, snapshotProcId);
+
+    assertTrue(UTIL.getMiniHBaseCluster().getRegions(tableName).size() == 1);
+    assertTrue(UTIL.countRows(tableName) == 0);
+
+    assertEquals(splitSubmittedCount + 1, splitProcMetrics.getSubmittedCounter().getCount());
+    assertEquals(splitFailedCount + 1, splitProcMetrics.getFailedCounter().getCount());
+  }
+
   private void deleteData(final TableName tableName, final int startDeleteRowNum)
     throws IOException, InterruptedException {
     Table t = UTIL.getConnection().getTable(tableName);
TestSnapshotProcedure.java
@@ -17,10 +17,12 @@
  */
 package org.apache.hadoop.hbase.master.procedure;

+import static org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.SnapshotState.SNAPSHOT_SNAPSHOT_ONLINE_REGIONS;
 import static org.junit.Assert.assertTrue;

 import java.io.IOException;
 import java.util.Optional;
+import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
@@ -51,6 +53,7 @@
 import org.slf4j.LoggerFactory;

 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.SnapshotState;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos;

@@ -70,6 +73,27 @@ public class TestSnapshotProcedure {
   protected SnapshotDescription snapshot;
   protected SnapshotProtos.SnapshotDescription snapshotProto;

+  public static final class DelaySnapshotProcedure extends SnapshotProcedure {
+    public DelaySnapshotProcedure() {
+    }
+
+    public DelaySnapshotProcedure(final MasterProcedureEnv env,
+      final SnapshotProtos.SnapshotDescription snapshot) {
+      super(env, snapshot);
+    }
+
+    @Override
+    protected Flow executeFromState(MasterProcedureEnv env,
+      MasterProcedureProtos.SnapshotState state)
+      throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
+      Flow flow = super.executeFromState(env, state);
+      if (state == SNAPSHOT_SNAPSHOT_ONLINE_REGIONS) {
+        TimeUnit.SECONDS.sleep(20);
+      }
+      return flow;
+    }
+  }
+
   @Before
   public void setup() throws Exception {
     TEST_UTIL = new HBaseTestingUtility();
