Skip to content

Commit

Permalink
HBASE-24128 [Flakey Tests] Add retry on thrift cmdline if client fail…
Browse files Browse the repository at this point in the history
…s plus misc debug (#1442)

hbase-server/src/test/java/org/apache/hadoop/hbase/TestClusterPortAssignment.java
 Saw case where Master failed startup but it came out as an IOE so we
 did not trip the retry logic.

hbase-server/src/test/java/org/apache/hadoop/hbase/TestInfoServers.java
 Add some debug and up timeouts. This test fails frequently for me
 locally.

hbase-server/src/test/java/org/apache/hadoop/hbase/client/locking/TestEntityLocks.java
 Up the wait from 2x 200ms to 10x in case a pause on hardware or GC.
 This test fails locally and up on jenkins.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestClearRegionBlockCache.java
 Debug. Have assert say what bad count was.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactingToCellFlatMapMemStore.java
 Fails on occasion. Found count is off by a few. Tricky to debug. HBASE-24129 to reenable.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionMergeTransactionOnCluster.java
 Debug. Add wait and check before moving to assert.

hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftHttpServer.java
 Check for null before shutting; can be null if failed start.

hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftServerCmdLine.java
 Add retry if client messes up connection. Fails for me locally.
  • Loading branch information
saintstack committed Apr 7, 2020
1 parent e71c442 commit 80d4a09
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public void testClusterPortAssignment() throws Exception {
cluster.getRegionServer(0).getRpcServer().getListenerAddress().getPort());
assertEquals("RS info port is incorrect", rsInfoPort,
cluster.getRegionServer(0).getInfoServer().getPort());
} catch (BindException|UnsupportedOperationException e) {
} catch (Exception e) {
if (e instanceof BindException || e.getCause() != null &&
(e.getCause() instanceof BindException || e.getCause().getCause() != null &&
e.getCause().getCause() instanceof BindException)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
Expand All @@ -20,12 +20,12 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.MiscTests;
import org.apache.hadoop.hbase.util.Bytes;
Expand Down Expand Up @@ -85,8 +85,8 @@ public void testGetMasterInfoPort() throws Exception {
}

/**
* Ensure when we go to top level index pages that we get redirected to an info-server specific status
* page.
* Ensure when we go to top level index pages that we get redirected to an info-server specific
* status page.
*/
@Test
public void testInfoServersRedirect() throws Exception {
Expand Down Expand Up @@ -121,9 +121,10 @@ public void testMasterServerReadOnly() throws Exception {
byte[] cf = Bytes.toBytes("d");
UTIL.createTable(tableName, cf);
UTIL.waitTableAvailable(tableName);
int port = UTIL.getHBaseCluster().getMaster().getInfoServer().getPort();
assertDoesNotContainContent(new URL("http://localhost:" + port + "/table.jsp?name=" + tableName
+ "&action=split&key="), "Table action request accepted");
HMaster master = UTIL.getHBaseCluster().getMaster();
int port = master.getRegionServerInfoPort(master.getServerName());
assertDoesNotContainContent(new URL("http://localhost:" + port + "/table.jsp?name=" +
tableName + "&action=split&key="), "Table action request accepted");
assertDoesNotContainContent(
new URL("http://localhost:" + port + "/table.jsp?name=" + tableName), "Actions:");
}
Expand All @@ -143,11 +144,11 @@ private void assertDoesNotContainContent(final URL u, final String expected) thr

private String getUrlContent(URL u) throws IOException {
java.net.URLConnection c = u.openConnection();
c.setConnectTimeout(2000);
c.setReadTimeout(2000);
c.setConnectTimeout(20000);
c.setReadTimeout(20000);
c.connect();
try (InputStream in = c.getInputStream()) {
return IOUtils.toString(in);
return IOUtils.toString(in, HConstants.UTF8_ENCODING);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ public void testEntityLockTimeout() throws Exception {
lock.requestLock();
lock.await();
assertTrue(lock.isLocked());
// Should get unlocked in next heartbeat i.e. after workerSleepTime. Wait 2x time.
assertTrue(waitLockTimeOut(lock, 2 * workerSleepTime));
// Should get unlocked in next heartbeat i.e. after workerSleepTime. Wait 10x time to be sure.
assertTrue(waitLockTimeOut(lock, 10 * workerSleepTime));
assertFalse(lock.getWorker().isAlive());
verify(abortable, times(1)).abort(any(), eq(null));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,16 @@
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
* CPUs.
*/
@Category({ MasterTests.class, MediumTests.class })
public class TestCloseRegionWhileRSCrash {
private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class);

@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
Expand Down Expand Up @@ -176,6 +179,7 @@ public void testRetryBackoff() throws IOException, InterruptedException {
try {
UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName());
} catch (IOException e) {
LOG.info("Failed move of {}", region.getRegionNameAsString(), e);
}
});
t.start();
Expand All @@ -185,12 +189,13 @@ public void testRetryBackoff() throws IOException, InterruptedException {
// wait until the timeout value increase three times
ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3);
// close connection to make sure that we can not finish the TRSP
HMaster master = UTIL.getMiniHBaseCluster().getMaster();
final HMaster master = UTIL.getMiniHBaseCluster().getMaster();
master.getConnection().close();
RESUME.countDown();
UTIL.waitFor(30000, () -> !master.isAlive());
// here we start a new master
UTIL.getMiniHBaseCluster().startMaster();
HMaster master2 = UTIL.getMiniHBaseCluster().startMaster().getMaster();
LOG.info("Master2 {}, joining move thread", master2.getServerName());
t.join();
// Make sure that the region is online, it may not on the original target server, as we will set
// forceNewPlan to true if there is a server crash
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,10 @@ public void testClearBlockCache() throws Exception {
HTU.getNumHFilesForRS(rs2, TABLE_NAME, FAMILY));
clearRegionBlockCache(rs2);

assertEquals(initialBlockCount1, blockCache1.getBlockCount());
assertEquals(initialBlockCount2, blockCache2.getBlockCount());
assertEquals("" + blockCache1.getBlockCount(),
initialBlockCount1, blockCache1.getBlockCount());
assertEquals("" + blockCache2.getBlockCount(),
initialBlockCount2, blockCache2.getBlockCount());
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -837,7 +837,9 @@ public void testFlatteningToJumboCellChunkMap() throws IOException {
* testForceCopyOfBigCellIntoImmutableSegment checks that the
* ImmutableMemStoreLAB's forceCopyOfBigCellInto does what it's supposed to do.
*/
@Test
@org.junit.Ignore @Test // Flakey. Disabled by HBASE-24128. HBASE-24129 is for reenable.
// TestCompactingToCellFlatMapMemStore.testForceCopyOfBigCellIntoImmutableSegment:902 i=1
// expected:<8389924> but was:<8389992>
public void testForceCopyOfBigCellIntoImmutableSegment() throws IOException {

if (toCellChunkMap == false) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
Expand Down Expand Up @@ -65,6 +66,7 @@
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.PairOfSameType;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
Expand All @@ -76,11 +78,9 @@
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;

import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
Expand Down Expand Up @@ -129,7 +129,9 @@ public static void beforeAllTests() throws Exception {
@AfterClass
public static void afterAllTests() throws Exception {
TEST_UTIL.shutdownMiniCluster();
if (ADMIN != null) ADMIN.close();
if (ADMIN != null) {
ADMIN.close();
}
}

@Test
Expand Down Expand Up @@ -285,8 +287,19 @@ public void testCleanMergeReference() throws Exception {
// cleaned up by the time we got here making the test sometimes flakey.
assertTrue(cleaned > 0);

mergedRegionResult = MetaTableAccessor.getRegionResult(
TEST_UTIL.getConnection(), mergedRegionInfo.getRegionName());
// Wait around a bit to give stuff a chance to complete.
while (true) {
mergedRegionResult = MetaTableAccessor
.getRegionResult(TEST_UTIL.getConnection(), mergedRegionInfo.getRegionName());
if (MetaTableAccessor.hasMergeRegions(mergedRegionResult.rawCells())) {
LOG.info("Waiting on cleanup of merge columns {}",
Arrays.asList(mergedRegionResult.rawCells()).stream().
map(c -> c.toString()).collect(Collectors.joining(",")));
Threads.sleep(50);
} else {
break;
}
}
assertFalse(MetaTableAccessor.hasMergeRegions(mergedRegionResult.rawCells()));
} finally {
ADMIN.catalogJanitorSwitch(true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,13 @@ protected void talkToThriftServer(String url, int customHeaderSize) throws Excep
}

private void stopHttpServerThread() throws Exception {
LOG.debug("Stopping " + " Thrift HTTP server");
thriftServer.stop();
httpServerThread.join();
LOG.debug("Stopping Thrift HTTP server {}", thriftServer);
if (thriftServer != null) {
thriftServer.stop();
}
if (httpServerThread != null) {
httpServerThread.join();
}
if (httpServerException != null) {
LOG.error("Command-line invocation of HBase Thrift server threw an " +
"exception", httpServerException);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,14 +297,18 @@ private boolean isBindException(Exception cmdLineException) {
@Test
public void testRunThriftServer() throws Exception {
ThriftServer thriftServer = createBoundServer();
try {
talkToThriftServer();
} catch (Exception ex) {
clientSideException = ex;
LOG.info("Exception", ex);
} finally {
stopCmdLineThread();
thriftServer.stop();
// Add retries in case we see stuff like connection reset
for (int i = 0; i < 10; i++) {
try {
talkToThriftServer();
break;
} catch (Exception ex) {
clientSideException = ex;
LOG.info("Exception", ex);
} finally {
stopCmdLineThread();
thriftServer.stop();
}
}

if (clientSideException != null) {
Expand Down

0 comments on commit 80d4a09

Please sign in to comment.