Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
706e1f0
ZOOKEEPER-3320: configurable retry count for election port bind in Qu…
Mar 21, 2019
b448f36
ZOOKEEPER-3320: add validation and logging of zookeeper.electionPortB…
Mar 22, 2019
883d35e
ZOOKEEPER-3320: add documentation for zookeeper.electionPortBindRetry
Lagrang Mar 25, 2019
9142958
ZOOKEEPER-3320: QuorumCnxManager.Listener extends ZookeeperCriticalTh…
Mar 27, 2019
bb0c77f
ZOOKEEPER-3320: use existing scheme to stop server when QuorumCnxMana…
Jul 11, 2019
a541ee9
Merge branch 'master' into ZOOKEEPER-3320
Jul 29, 2019
e9db1e4
ZOOKEEPER-3320: fix of test compilation
Jul 29, 2019
c1afdf9
Merge branch 'master' into ZOOKEEPER-3320
Aug 1, 2019
b4abdc7
ZOOKEEPER-3320: handle 0 value for zookeeper.electionPortBindRetry as…
Aug 1, 2019
e25b445
ZOOKEEPER-3320: support custom socket bind error handler in QuorumCnx…
Aug 2, 2019
da33c1d
ZOOKEEPER-3320: configurable retry count for election port bind in Qu…
Mar 21, 2019
a9a9342
ZOOKEEPER-3320: add validation and logging of zookeeper.electionPortB…
Mar 22, 2019
0888a29
ZOOKEEPER-3320: add documentation for zookeeper.electionPortBindRetry
Lagrang Mar 25, 2019
587fd95
ZOOKEEPER-3320: QuorumCnxManager.Listener extends ZookeeperCriticalTh…
Mar 27, 2019
eeb5c41
ZOOKEEPER-3320: use existing scheme to stop server when QuorumCnxMana…
Jul 11, 2019
5051b4c
ZOOKEEPER-3320: fix of test compilation
Jul 29, 2019
7b222ef
ZOOKEEPER-3320: handle 0 value for zookeeper.electionPortBindRetry as…
Aug 1, 2019
1af098d
ZOOKEEPER-3320: support custom socket bind error handler in QuorumCnx…
Aug 2, 2019
f95ee18
ZOOKEEPER-3320: CnxManagerTest.testCnxManagerListenerThreadConfigurab…
Aug 2, 2019
fb9cdc5
Merge remote-tracking branch 'lagrang/ZOOKEEPER-3320' into ZOOKEEPER-…
Aug 2, 2019
50d6465
ZOOKEEPER-3320: doc fix, rename config property 'zookeeper.electionPo…
Aug 2, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
Original file line number Diff line number Diff line change
Expand Up @@ -1088,6 +1088,19 @@ As an example, this will enable all four letter word commands:
properly, check your operating system's options regarding TCP
keepalive for more information. Defaults to
**false**.

* *electionPortBindRetry* :
(Java system property only: **zookeeper.electionPortBindRetry**)
Sets the maximum number of retries when the ZooKeeper server fails to bind
the leader election port. Such errors can be temporary and recoverable,
such as the DNS issue described in [ZOOKEEPER-3320](https://issues.apache.org/jira/projects/ZOOKEEPER/issues/ZOOKEEPER-3320),
or non-retryable, such as the port already being in use.
For transient errors, this property can improve the availability
of the ZooKeeper server and help it recover on its own.
The default value is 3. In container environments, especially Kubernetes,
this value should be increased, or set to 0 (infinite retries), to overcome issues
related to DNS name resolution.


* *observer.reconnectDelayMs* :
(Java system property: **zookeeper.observer.reconnectDelayMs**)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

package org.apache.zookeeper.server.quorum;

import static org.apache.zookeeper.common.NetUtils.formatInetAddr;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
Expand All @@ -36,31 +38,28 @@
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.NoSuchElementException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import javax.net.ssl.SSLSocket;
import org.apache.zookeeper.common.X509Exception;
import org.apache.zookeeper.server.ExitCode;
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
import org.apache.zookeeper.server.util.ConfigUtils;
import org.apache.zookeeper.server.ZooKeeperThread;
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthLearner;
import org.apache.zookeeper.server.quorum.auth.QuorumAuthServer;
import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
import org.apache.zookeeper.server.util.ConfigUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.net.ssl.SSLSocket;
import static org.apache.zookeeper.common.NetUtils.formatInetAddr;

/**
* This class implements a connection manager for leader election using TCP. It
* maintains one connection for every pair of servers. The tricky part is to
Expand Down Expand Up @@ -848,12 +847,39 @@ private void resetConnectionThreadCount() {
*/
public class Listener extends ZooKeeperThread {

// Name of the Java system property that configures the election port bind retry limit.
private static final String ELECTION_PORT_BIND_RETRY = "zookeeper.electionPortBindRetry";
// Retry limit used when the property is not set or holds a negative value.
private static final int DEFAULT_PORT_BIND_MAX_RETRY = 3;

// Effective retry limit, resolved once in the constructor; 0 means retry without limit.
private final int portBindMaxRetry;
// Callback run when the listener thread gives up, without a shutdown request, because of a
// socket error. The default terminates the JVM with the UNABLE_TO_BIND_QUORUM_PORT exit code.
private Runnable socketBindErrorHandler = () -> System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue());
// Server socket of the election port.
// NOTE(review): presumably volatile because it is also touched by another thread during
// shutdown -- confirm against the rest of the class.
volatile ServerSocket ss = null;

/**
 * Creates the listener thread and resolves the election port bind retry limit.
 *
 * <p>The limit is read from the Java system property named by
 * {@code ELECTION_PORT_BIND_RETRY}. A value of 0 means "retry indefinitely".
 * When the property is not set, or holds a negative value, the limit falls
 * back to {@code DEFAULT_PORT_BIND_MAX_RETRY}; a negative value is reported
 * as a warning because it indicates a misconfiguration.
 */
public Listener() {
    // During startup of thread, thread name will be overridden to
    // specific election address
    super("ListenerThread");

    // maximum retry count while trying to bind to election port
    // see ZOOKEEPER-3320 for more details
    final int maxRetry = Integer.getInteger(ELECTION_PORT_BIND_RETRY,
            DEFAULT_PORT_BIND_MAX_RETRY);
    if (maxRetry < 0) {
        // A negative value is an operator mistake: surface it at WARN so it
        // is not lost among routine startup messages.
        LOG.warn("'{}' contains invalid value: {}(must be >= 0). "
                + "Use default value of {} instead.",
                ELECTION_PORT_BIND_RETRY, maxRetry, DEFAULT_PORT_BIND_MAX_RETRY);
        portBindMaxRetry = DEFAULT_PORT_BIND_MAX_RETRY;
    } else {
        LOG.info("Election port bind maximum retries is {}",
                maxRetry == 0 ? "infinite" : maxRetry);
        portBindMaxRetry = maxRetry;
    }
}

/**
 * Change socket bind error handler. Used for testing.
 *
 * <p>The handler replaces the default action, which exits the JVM, and is
 * run by the listener thread when it gives up because of a socket error.
 *
 * @param errorHandler callback to run instead of exiting the JVM; must not be {@code null}
 * @throws IllegalArgumentException if {@code errorHandler} is {@code null}
 */
void setSocketBindErrorHandler(Runnable errorHandler) {
    // Fail fast here: a null handler would otherwise only surface as a
    // NullPointerException inside the listener thread, at the very moment
    // the fatal bind failure has to be handled.
    if (errorHandler == null) {
        throw new IllegalArgumentException("errorHandler must not be null");
    }
    this.socketBindErrorHandler = errorHandler;
}

/**
Expand All @@ -865,7 +891,7 @@ public void run() {
InetSocketAddress addr;
Socket client = null;
Exception exitException = null;
while((!shutdown) && (numRetries < 3)){
while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) {
try {
if (self.shouldUsePortUnification()) {
LOG.info("Creating TLS-enabled quorum server socket");
Expand Down Expand Up @@ -935,15 +961,18 @@ public void run() {
}
LOG.info("Leaving listener");
if (!shutdown) {
LOG.error("As I'm leaving the listener thread, "
+ "I won't be able to participate in leader "
+ "election any longer: "
+ formatInetAddr(self.getElectionAddress()));
if (exitException instanceof BindException) {
LOG.error("As I'm leaving the listener thread after "
+ numRetries + " errors. "
+ "I won't be able to participate in leader "
+ "election any longer: "
+ formatInetAddr(self.getElectionAddress())
+ ". Use " + ELECTION_PORT_BIND_RETRY + " property to "
+ "increase retry count.");
if (exitException instanceof SocketException) {
// After leaving listener thread, the host cannot join the
// quorum anymore, this is a severe error that we cannot
// recover from, so we need to exit
System.exit(ExitCode.UNABLE_TO_BIND_QUORUM_PORT.getValue());
socketBindErrorHandler.run();
}
} else if (ss != null) {
// Clean up for shutdown.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.net.Socket;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import javax.net.ssl.HandshakeCompletedListener;
Expand Down Expand Up @@ -290,6 +291,36 @@ public void testCnxManagerSpinLock() throws Exception {
Assert.assertFalse(cnxManager.listener.isAlive());
}

/**
 * Test for the bug described in
 * <a href="https://issues.apache.org/jira/browse/ZOOKEEPER-3320">ZOOKEEPER-3320</a>.
 * Creates a peer whose election address contains an unresolvable DNS name:
 * the leader election listener thread should stop after the configured
 * number of bind errors and run the socket bind error handler.
 *
 * @throws Exception if the peer or its connection manager cannot be created
 */
@Test
public void testCnxManagerListenerThreadConfigurableRetry() throws Exception {
    final Map<Long, QuorumServer> unresolvablePeers = new HashMap<>();
    final long myid = 1L;
    unresolvablePeers.put(myid, new QuorumServer(myid, "unresolvable-domain.org:2182:2183;2181"));
    final QuorumPeer peer = new QuorumPeer(unresolvablePeers,
            ClientBase.createTmpDir(),
            ClientBase.createTmpDir(),
            2181, 3, myid, 1000, 2, 2, 2);
    final QuorumCnxManager cnxManager = peer.createCnxnManager();
    final QuorumCnxManager.Listener listener = cnxManager.listener;

    // Replace the default handler (System.exit) so the failure can be
    // observed without terminating the test JVM.
    final AtomicBoolean errorHandlerInvoked = new AtomicBoolean();
    listener.setSocketBindErrorHandler(() -> errorHandlerInvoked.set(true));
    listener.start();

    // Bounded wait: if the listener has a bug and never stops, fail the
    // test instead of hanging forever.
    listener.join(15000);

    Assert.assertFalse("Listener thread is still alive after exhausting "
            + "election port bind retries", listener.isAlive());
    Assert.assertTrue("Socket bind error handler was not invoked after "
            + "listener thread death", errorHandlerInvoked.get());
}

/**
* Tests a bug in QuorumCnxManager that causes a NPE when a 3.4.6
* observer connects to a 3.5.0 server.
Expand Down