Skip to content

Commit

Permalink
msg: ceph_abort() when there are enough accepter errors in msg server
Browse files Browse the repository at this point in the history
In some extreme cases (we have met one in our production cluster), when the Accepter thread breaks out of its accept loop, new clients cannot connect to the OSD. Because the existing heartbeat connections are already established, other OSDs cannot detect the failure and therefore never notify the monitor to mark the failed OSD down.
With this patch, when there are abnormal communication errors, we just call ceph_abort() so that the OSD goes down quickly and other OSDs can notify the monitor to mark the failed OSD down.
Signed-off-by: penglaiyxy@gmail.com <penglaiyxy@gmail.com>

(cherry picked from commit 00e0ab4)

Conflicts:
	src/common/legacy_config_opts.h : Resolved for ms_max_accept_failures
	src/common/options.cc : Resolved for ms_max_accept_failures
	src/msg/async/AsyncMessenger.cc : Resolved in accept
	src/msg/simple/Accepter.cc : Resolved in entry
  • Loading branch information
root authored and Prashant D committed Oct 4, 2018
1 parent 2c73ec0 commit 999980a
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 6 deletions.
4 changes: 4 additions & 0 deletions src/common/legacy_config_opts.h
Expand Up @@ -169,6 +169,10 @@ OPTION(ms_async_rdma_roce_ver, OPT_INT) // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.
OPTION(ms_async_rdma_sl, OPT_INT) // in RoCE, this means PCP
OPTION(ms_async_rdma_dscp, OPT_INT) // in RoCE, this means DSCP

// When enough accept failures have occurred to indicate an unrecoverable condition,
// just call ceph_abort(). The failure threshold is configurable via this option.
OPTION(ms_max_accept_failures, OPT_INT)

OPTION(ms_dpdk_port_id, OPT_INT)
SAFE_OPTION(ms_dpdk_coremask, OPT_STR) // it is modified in unittest so that use SAFE_OPTION to declare
OPTION(ms_dpdk_memory_channel, OPT_STR)
Expand Down
5 changes: 5 additions & 0 deletions src/common/options.cc
Expand Up @@ -812,6 +812,11 @@ std::vector<Option> get_global_options() {
.set_default(96)
.set_description(""),

Option("ms_max_accept_failures", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(4)
.set_description("The maximum number of consecutive failed accept() calls before "
"considering the daemon is misconfigured and abort it."),

Option("ms_dpdk_port_id", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
Expand Down
14 changes: 12 additions & 2 deletions src/msg/async/AsyncMessenger.cc
Expand Up @@ -165,6 +165,8 @@ void Processor::accept()
opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
opts.priority = msgr->get_socket_priority();
unsigned accept_error_num = 0;

while (true) {
entity_addr_t addr;
ConnectedSocket cli_socket;
Expand All @@ -185,15 +187,23 @@ void Processor::accept()
} else if (r == -EMFILE || r == -ENFILE) {
lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
<< " errno " << r << " " << cpp_strerror(r) << dendl;
break;
if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
ceph_abort();
}
continue;
} else if (r == -ECONNABORTED) {
ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
<< " errno " << r << " " << cpp_strerror(r) << dendl;
continue;
} else {
lderr(msgr->cct) << __func__ << " no incoming connection?"
<< " errno " << r << " " << cpp_strerror(r) << dendl;
break;
if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
ceph_abort();
}
continue;
}
}
}
Expand Down
10 changes: 6 additions & 4 deletions src/msg/simple/Accepter.cc
Expand Up @@ -293,7 +293,7 @@ void *Accepter::entry()
}
ldout(msgr->cct,1) << __func__ << " poll got error"
<< " errno " << errno << " " << cpp_strerror(errno) << dendl;
break;
ceph_abort();
}
ldout(msgr->cct,10) << __func__ << " poll returned oke: " << r << dendl;
ldout(msgr->cct,20) << __func__ << " pfd.revents[0]=" << pfd[0].revents << dendl;
Expand All @@ -302,7 +302,7 @@ void *Accepter::entry()
if (pfd[0].revents & (POLLERR | POLLNVAL | POLLHUP)) {
ldout(msgr->cct,1) << __func__ << " poll got errors in revents "
<< pfd[0].revents << dendl;
break;
ceph_abort();
}
if (pfd[1].revents & (POLLIN | POLLERR | POLLNVAL | POLLHUP)) {
// We got "signaled" to exit the poll
Expand All @@ -329,8 +329,10 @@ void *Accepter::entry()
int e = errno;
ldout(msgr->cct,0) << __func__ << " no incoming connection? sd = " << sd
<< " errno " << e << " " << cpp_strerror(e) << dendl;
if (++errors > 4)
break;
if (++errors > msgr->cct->_conf->ms_max_accept_failures) {
lderr(msgr->cct) << "accetper has encoutered enough errors, just do ceph_abort()." << dendl;
ceph_abort();
}
}
}

Expand Down

0 comments on commit 999980a

Please sign in to comment.