HBASE-24034 [Flakey Tests] A couple of fixes and cleanups
hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupMajorCompactionTTL.java
 Remove a spurious assert. Just before it, the test waits an arbitrary 10
 seconds, and compactions could have completed inside this time. The
 spirit of the test remains.

hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/HFileCleaner.java
 Get the file cleaner to shut down promptly; it's sticking around. See if
 this helps with TestMasterShutdown.
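 A minimal sketch of the volatile stop-flag pattern the fix relies on
 (hypothetical names, not the actual HFileCleaner code):

   class CleanerWorker implements Runnable {
     // volatile so the write from stop() is promptly visible to the worker
     // thread; without it the loop can keep spinning after shutdown is asked for.
     private volatile boolean running = true;

     @Override
     public void run() {
       while (running) {
         // do one round of cleaning, then sleep briefly
       }
     }

     void stop() {
       running = false;
     }
   }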

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
 We get a rare NPE trying to sync. Make local copy of SyncFuture and see
 if that helps.
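 Roughly, this is the snapshot-to-a-local pattern: copy the reference once
 and dereference only the copy, so a later reset of the shared slot cannot
 turn a subsequent use into an NPE. A hypothetical sketch, not FSHLog's
 actual fields:

   import java.util.concurrent.BlockingQueue;
   import java.util.concurrent.LinkedBlockingQueue;

   class SyncWorker {
     private final BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>();
     private Runnable taken;              // reused slot, may be reset by other code

     void processOne() throws InterruptedException {
       taken = queue.take();
       Runnable local = taken;            // snapshot once
       local.run();                       // every later use goes through 'local'
     }
   }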

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncRegionAdminApi.java
 Compaction may have completed when not expected; allow for it.

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestBlockEvictionFromClient.java
 Add a wait before testing; compaction may not have completed. Let
 compaction complete before progressing, then test for an empty cache.
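 The wait has the test utility's usual predicate shape, roughly (the region
 variable is hypothetical; waitFor/getCompactionState are as used in the
 diff below):

   TEST_UTIL.waitFor(30000, () -> region.getCompactionState().equals(CompactionState.NONE));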

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
 Fewer resources.

hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestDefaultLoadBalancer.java
 Fewer resources.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestEndToEndSplitTransaction.java
 Wait until the region is online before we try to compact (otherwise the
 request is ignored).

hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
 Disable a test that fails randomly with a Mockito complaint on some
 Mac OS X machines.

TestMasterShutdown... fix NPE in RSProcedureDispatcher: catch it, convert it to a
false return from start(), and have the master check for successful startup.
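Condensed, the two changes fit together like this (hypothetical class names,
not the real HMaster/RSProcedureDispatcher code):

  class Dispatcher {
    boolean start() {
      try {
        // register listeners, seed nodes from the server manager, ...
        return true;
      } catch (Exception e) {   // e.g. NPE while the master is still wiring itself up
        return false;           // report failure instead of leaking the exception
      }
    }
  }

  class Master {
    void startProcedureExecutor(Dispatcher dispatcher) throws java.io.IOException {
      if (!dispatcher.start()) {
        throw new java.io.IOException("Failed start of remote dispatcher");
      }
    }
  }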
saintstack committed Mar 23, 2020
1 parent af4fdcc commit afc1746
Showing 13 changed files with 84 additions and 38 deletions.

@@ -551,7 +551,7 @@ static <T> CompletableFuture<T> getOrFetch(AtomicReference<T> cacheRef,
}
LOG.trace("{} cache is null, try fetching from registry", type);
if (futureRef.compareAndSet(null, new CompletableFuture<>())) {
LOG.debug("Start fetching{} from registry", type);
LOG.debug("Start fetching {} from registry", type);
CompletableFuture<T> future = futureRef.get();
addListener(fetch.get(), (value, error) -> {
if (error != null) {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -1573,7 +1573,9 @@ public void abortProcess() {
// startProcedureExecutor. See the javadoc for finishActiveMasterInitialization for more
// details.
procedureExecutor.init(numThreads, abortOnCorruption);
procEnv.getRemoteDispatcher().start();
if (!procEnv.getRemoteDispatcher().start()) {
throw new HBaseIOException("Failed start of remote dispatcher");
}
}

private void startProcedureExecutor() throws IOException {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/HFileCleaner.java
@@ -101,7 +101,7 @@ public HFileCleaner(final int period, final Stoppable stopper, Configuration con
private long cleanerThreadTimeoutMsec;
private long cleanerThreadCheckIntervalMsec;
private List<Thread> threads = new ArrayList<Thread>();
private boolean running;
private volatile boolean running;

private AtomicLong deletedLargeFiles = new AtomicLong();
private AtomicLong deletedSmallFiles = new AtomicLong();

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -30,6 +30,7 @@
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
@@ -93,11 +94,16 @@ public boolean start() {
if (!super.start()) {
return false;
}

master.getServerManager().registerListener(this);
procedureEnv = master.getMasterProcedureExecutor().getEnvironment();
for (ServerName serverName: master.getServerManager().getOnlineServersList()) {
addNode(serverName);
// Around startup, if failed, some of the below may be set back to null so NPE is possible.
try {
master.getServerManager().registerListener(this);
procedureEnv = master.getMasterProcedureExecutor().getEnvironment();
for (ServerName serverName : master.getServerManager().getOnlineServersList()) {
addNode(serverName);
}
} catch (Exception e) {
LOG.info("Failed start", e);
return false;
}
return true;
}

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -552,20 +552,28 @@ public void run() {
int syncCount = 0;

try {
// Make a local copy of takeSyncFuture after we get it. We've been running into NPEs
// 2020-03-22 16:54:32,180 WARN [sync.1] wal.FSHLog$SyncRunner(589): UNEXPECTED
// java.lang.NullPointerException
// at org.apache.hadoop.hbase.regionserver.wal.FSHLog$SyncRunner.run(FSHLog.java:582)
// at java.lang.Thread.run(Thread.java:748)
SyncFuture sf;
while (true) {
takeSyncFuture = null;
// We have to process what we 'take' from the queue
takeSyncFuture = this.syncFutures.take();
// Make local copy.
sf = takeSyncFuture;
currentSequence = this.sequence;
long syncFutureSequence = takeSyncFuture.getTxid();
long syncFutureSequence = sf.getTxid();
if (syncFutureSequence > currentSequence) {
throw new IllegalStateException("currentSequence=" + currentSequence
+ ", syncFutureSequence=" + syncFutureSequence);
}
// See if we can process any syncfutures BEFORE we go sync.
long currentHighestSyncedSequence = highestSyncedTxid.get();
if (currentSequence < currentHighestSyncedSequence) {
syncCount += releaseSyncFuture(takeSyncFuture, currentHighestSyncedSequence, null);
syncCount += releaseSyncFuture(sf, currentHighestSyncedSequence, null);
// Done with the 'take'. Go around again and do a new 'take'.
continue;
}
@@ -579,7 +587,7 @@
Throwable lastException = null;
try {
TraceUtil.addTimelineAnnotation("syncing writer");
writer.sync(takeSyncFuture.isForceSync());
writer.sync(sf.isForceSync());
TraceUtil.addTimelineAnnotation("writer synced");
currentSequence = updateHighestSyncedSequence(currentSequence);
} catch (IOException e) {

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAsyncRegionAdminApi.java
@@ -434,11 +434,11 @@ private void compactionTest(final TableName tableName, final int flushes,
} else {
int singleFamDiff = countBeforeSingleFamily - countAfterSingleFamily;
// assert only change was to single column family
assertTrue(singleFamDiff == (countBefore - countAfter));
assertEquals(singleFamDiff, (countBefore - countAfter));
if (expectedState == CompactionState.MAJOR) {
assertTrue(1 == countAfterSingleFamily);
assertEquals(1, countAfterSingleFamily);
} else {
assertTrue(1 < countAfterSingleFamily);
assertTrue("" + countAfterSingleFamily, 1 <= countAfterSingleFamily);
}
}
}

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestBlockEvictionFromClient.java
@@ -36,6 +36,7 @@
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
@@ -601,11 +602,18 @@ public void testBlockRefCountAfterSplits() throws IOException, InterruptedExcept
region.flush(true);
ServerName rs = Iterables.getOnlyElement(TEST_UTIL.getAdmin().getRegionServers());
int regionCount = TEST_UTIL.getAdmin().getRegions(rs).size();
LOG.info("About to SPLIT on " + Bytes.toString(ROW1));
LOG.info("About to SPLIT on {} {}, count={}", Bytes.toString(ROW1), region.getRegionInfo(),
regionCount);
TEST_UTIL.getAdmin().split(tableName, ROW1);
// Wait for splits
TEST_UTIL.waitFor(60000, () -> TEST_UTIL.getAdmin().getRegions(rs).size() > regionCount);
region.compact(true);
List<HRegion> regions = TEST_UTIL.getMiniHBaseCluster().getRegionServer(rs).getRegions();
for (HRegion r: regions) {
LOG.info("" + r.getCompactionState());
TEST_UTIL.waitFor(30000, () -> r.getCompactionState().equals(CompactionState.NONE));
}
LOG.info("Split finished, is region closed {} {}", region.isClosed(), cache);
Iterator<CachedBlock> iterator = cache.iterator();
// Though the split had created the HalfStorefileReader - the firstkey and lastkey scanners
// should be closed inorder to return those blocks
@@ -1212,8 +1220,10 @@ private void iterateBlockCache(BlockCache cache, Iterator<CachedBlock> iterator)
BlockCacheKey cacheKey = new BlockCacheKey(next.getFilename(), next.getOffset());
if (cache instanceof BucketCache) {
refCount = ((BucketCache) cache).getRpcRefCount(cacheKey);
LOG.info("BucketCache {} {}", cacheKey, refCount);
} else if (cache instanceof CombinedBlockCache) {
refCount = ((CombinedBlockCache) cache).getRpcRefCount(cacheKey);
LOG.info("CombinedBlockCache {} {}", cacheKey, refCount);
} else {
continue;
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterShutdown.java
@@ -85,8 +85,8 @@ public void testMasterShutdown() throws Exception {
htu = new HBaseTestingUtility(conf);
StartMiniClusterOption option = StartMiniClusterOption.builder()
.numMasters(3)
.numRegionServers(3)
.numDataNodes(3)
.numRegionServers(1)
.numDataNodes(1)
.build();
final MiniHBaseCluster cluster = htu.startMiniCluster(option);


hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestDefaultLoadBalancer.java
@@ -128,14 +128,12 @@ public static void beforeAllTests() throws Exception {
*
* Invariant is that all servers should be hosting either floor(average) or
* ceiling(average) at both table level and cluster level
*
* @throws Exception
*/
@Test
public void testBalanceClusterOverall() throws Exception {
Map<TableName, Map<ServerName, List<RegionInfo>>> clusterLoad = new TreeMap<>();
for (int[] mockCluster : clusterStateMocks) {
Map<ServerName, List<RegionInfo>> clusterServers = mockClusterServers(mockCluster, 50);
Map<ServerName, List<RegionInfo>> clusterServers = mockClusterServers(mockCluster, 30);
List<ServerAndLoad> clusterList = convertToList(clusterServers);
clusterLoad.put(TableName.valueOf(name.getMethodName()), clusterServers);
HashMap<TableName, TreeMap<ServerName, List<RegionInfo>>> result =

@@ -181,7 +181,7 @@ public void testUserClusterScopeQuota() throws Exception {
triggerUserCacheRefresh(TEST_UTIL, true, TABLE_NAMES);
}

@org.junit.Ignore @Test // Spews the log w/ triggering of scheduler?
@org.junit.Ignore @Test // Spews the log w/ triggering of scheduler? HBASE-24035
public void testUserNamespaceClusterScopeQuota() throws Exception {
final Admin admin = TEST_UTIL.getAdmin();
final String userName = User.getCurrent().getShortName();

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestEndToEndSplitTransaction.java
@@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -21,7 +21,6 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -42,6 +41,7 @@
import org.apache.hadoop.hbase.ScheduledChore;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
@@ -69,7 +69,6 @@
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.collect.Iterators;
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
@@ -401,6 +400,17 @@ public static void flushAndBlockUntilDone(Admin admin, HRegionServer rs, byte[]
public static void compactAndBlockUntilDone(Admin admin, HRegionServer rs, byte[] regionName)
throws IOException, InterruptedException {
log("Compacting region: " + Bytes.toStringBinary(regionName));
// Wait till its online before we do compact else it comes back with NoServerForRegionException
try {
TEST_UTIL.waitFor(10000, new Waiter.Predicate<Exception>() {
@Override public boolean evaluate() throws Exception {
return rs.getServerName().equals(MetaTableAccessor.
getRegionLocation(admin.getConnection(), regionName).getServerName());
}
});
} catch (Exception e) {
throw new IOException(e);
}
admin.majorCompactRegion(regionName);
log("blocking until compaction is complete: " + Bytes.toStringBinary(regionName));
Threads.sleepWithoutInterrupt(500);

hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupMajorCompactionTTL.java
@@ -90,8 +90,6 @@ public void testCompactingTables() throws Exception {
for (TableName tableName : tableNames) {
int numberOfRegions = admin.getRegions(tableName).size();
int numHFiles = utility.getNumHFiles(tableName, FAMILY);
// we should have a table with more store files than we would before we major compacted.
assertTrue(numberOfRegions < numHFiles);
modifyTTL(tableName);
}

@@ -105,4 +103,4 @@
assertEquals(numberOfRegions, numHFiles);
}
}
}
}

hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java
@@ -22,10 +22,10 @@
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.mockito.ArgumentMatchers.anyLong;
import static org.mockito.ArgumentMatchers.argThat;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.eq;
import static org.mockito.Matchers.isA;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.ArgumentMatchers.isA;
import static org.mockito.Mockito.atLeastOnce;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.spy;
@@ -60,7 +60,7 @@
import org.junit.runner.RunWith;
import org.mockito.ArgumentMatcher;
import org.mockito.Mock;
import org.mockito.runners.MockitoJUnitRunner;
import org.mockito.junit.MockitoJUnitRunner;
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;

@RunWith(MockitoJUnitRunner.class)
@@ -184,16 +184,30 @@ public void testCanaryRegionTaskResult() throws Exception {
}
}

@Test
// Ignore this test. It fails w/ the below on some mac os x.
// [ERROR] Failures:
// [ERROR] TestCanaryTool.testReadTableTimeouts:216
// Argument(s) are different! Wanted:
// mockAppender.doAppend(
// <custom argument matcher>
// );
// -> at org.apache.hadoop.hbase.tool.TestCanaryTool.testReadTableTimeouts(TestCanaryTool.java:216)
// Actual invocations have different arguments:
// mockAppender.doAppend(
// org.apache.log4j.spi.LoggingEvent@2055cfc1
// );
// )
// )
//
@org.junit.Ignore @Test
public void testReadTableTimeouts() throws Exception {
final TableName [] tableNames = new TableName[2];
tableNames[0] = TableName.valueOf(name.getMethodName() + "1");
tableNames[1] = TableName.valueOf(name.getMethodName() + "2");
final TableName [] tableNames = new TableName[] {TableName.valueOf(name.getMethodName() + "1"),
TableName.valueOf(name.getMethodName() + "2")};
// Create 2 test tables.
for (int j = 0; j<2; j++) {
for (int j = 0; j < 2; j++) {
Table table = testingUtility.createTable(tableNames[j], new byte[][] { FAMILY });
// insert some test rows
for (int i=0; i<1000; i++) {
for (int i = 0; i < 10; i++) {
byte[] iBytes = Bytes.toBytes(i + j);
Put p = new Put(iBytes);
p.addColumn(FAMILY, COLUMN, iBytes);
@@ -209,7 +223,7 @@ public void testReadTableTimeouts() throws Exception {
name.getMethodName() + "2"};
assertEquals(0, ToolRunner.run(testingUtility.getConfiguration(), canary, args));
verify(sink, times(tableNames.length)).initializeAndGetReadLatencyForTable(isA(String.class));
for (int i=0; i<2; i++) {
for (int i = 0; i < 2; i++) {
assertNotEquals("verify non-null read latency", null,
sink.getReadLatencyMap().get(tableNames[i].getNameAsString()));
assertNotEquals("verify non-zero read latency", 0L,
