Skip to content

Commit

Permalink
Merge pull request #24419 from pdvian/wip-36157-luminous
Browse files Browse the repository at this point in the history
luminous: msg: ceph_abort() when there are enough accepter errors in msg server

Reviewed-by: Kefu Chai <kchai@redhat.com>
  • Loading branch information
yuriw committed Oct 5, 2018
2 parents 593d3a4 + 999980a commit 3f8d7d4
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 6 deletions.
4 changes: 4 additions & 0 deletions src/common/legacy_config_opts.h
Expand Up @@ -169,6 +169,10 @@ OPTION(ms_async_rdma_roce_ver, OPT_INT) // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.
OPTION(ms_async_rdma_sl, OPT_INT) // in RoCE, this means PCP
OPTION(ms_async_rdma_dscp, OPT_INT) // in RoCE, this means DSCP

// When enough accept failures have occurred to indicate an unrecoverable condition,
// just call ceph_abort(). The failure threshold is configurable via this option.
OPTION(ms_max_accept_failures, OPT_INT)

OPTION(ms_dpdk_port_id, OPT_INT)
SAFE_OPTION(ms_dpdk_coremask, OPT_STR) // it is modified in unittest so that use SAFE_OPTION to declare
OPTION(ms_dpdk_memory_channel, OPT_STR)
Expand Down
5 changes: 5 additions & 0 deletions src/common/options.cc
Expand Up @@ -812,6 +812,11 @@ std::vector<Option> get_global_options() {
.set_default(96)
.set_description(""),

Option("ms_max_accept_failures", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(4)
.set_description("The maximum number of consecutive failed accept() calls before "
"considering the daemon is misconfigured and abort it."),

Option("ms_dpdk_port_id", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description(""),
Expand Down
14 changes: 12 additions & 2 deletions src/msg/async/AsyncMessenger.cc
Expand Up @@ -165,6 +165,8 @@ void Processor::accept()
opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
opts.priority = msgr->get_socket_priority();
unsigned accept_error_num = 0;

while (true) {
entity_addr_t addr;
ConnectedSocket cli_socket;
Expand All @@ -185,15 +187,23 @@ void Processor::accept()
} else if (r == -EMFILE || r == -ENFILE) {
lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
<< " errno " << r << " " << cpp_strerror(r) << dendl;
break;
if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
ceph_abort();
}
continue;
} else if (r == -ECONNABORTED) {
ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
<< " errno " << r << " " << cpp_strerror(r) << dendl;
continue;
} else {
lderr(msgr->cct) << __func__ << " no incoming connection?"
<< " errno " << r << " " << cpp_strerror(r) << dendl;
break;
if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
ceph_abort();
}
continue;
}
}
}
Expand Down
10 changes: 6 additions & 4 deletions src/msg/simple/Accepter.cc
Expand Up @@ -293,7 +293,7 @@ void *Accepter::entry()
}
ldout(msgr->cct,1) << __func__ << " poll got error"
<< " errno " << errno << " " << cpp_strerror(errno) << dendl;
break;
ceph_abort();
}
ldout(msgr->cct,10) << __func__ << " poll returned oke: " << r << dendl;
ldout(msgr->cct,20) << __func__ << " pfd.revents[0]=" << pfd[0].revents << dendl;
Expand All @@ -302,7 +302,7 @@ void *Accepter::entry()
if (pfd[0].revents & (POLLERR | POLLNVAL | POLLHUP)) {
ldout(msgr->cct,1) << __func__ << " poll got errors in revents "
<< pfd[0].revents << dendl;
break;
ceph_abort();
}
if (pfd[1].revents & (POLLIN | POLLERR | POLLNVAL | POLLHUP)) {
// We got "signaled" to exit the poll
Expand All @@ -329,8 +329,10 @@ void *Accepter::entry()
int e = errno;
ldout(msgr->cct,0) << __func__ << " no incoming connection? sd = " << sd
<< " errno " << e << " " << cpp_strerror(e) << dendl;
if (++errors > 4)
break;
if (++errors > msgr->cct->_conf->ms_max_accept_failures) {
lderr(msgr->cct) << "accetper has encoutered enough errors, just do ceph_abort()." << dendl;
ceph_abort();
}
}
}

Expand Down

0 comments on commit 3f8d7d4

Please sign in to comment.