From 999980ac69d4f346ec3395e001f10abe1e763fa5 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 29 Jul 2018 21:29:48 -0400 Subject: [PATCH] msg: ceph_abort() when there are enough accepter errors in msg server In some extreme cases (we have met one in our production cluster), when the Accepter thread breaks out, new clients cannot connect to the osd. Because the former heartbeat connections are already connected, other osds cannot detect the failure and notify the monitor to mark the failed osd down. In this patch, when there are abnormal communication errors, we just ceph_abort() so that the osd goes down quickly and other osds can notify the monitor to mark the failed osd down. Signed-off-by: penglaiyxy@gmail.com (cherry picked from commit 00e0ab407b2e9659d9121be1217e95c8117c411e) Conflicts: src/common/legacy_config_opts.h : Resolved for ms_max_accept_failures src/common/options.cc : Resolved for ms_max_accept_failures src/msg/async/AsyncMessenger.cc : Resolved in accept src/msg/simple/Accepter.cc : Resolved in entry --- src/common/legacy_config_opts.h | 4 ++++ src/common/options.cc | 5 +++++ src/msg/async/AsyncMessenger.cc | 14 ++++++++++++-- src/msg/simple/Accepter.cc | 10 ++++++---- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h index 9c97aae821815..a51870ef64289 100644 --- a/src/common/legacy_config_opts.h +++ b/src/common/legacy_config_opts.h @@ -169,6 +169,10 @@ OPTION(ms_async_rdma_roce_ver, OPT_INT) // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1. OPTION(ms_async_rdma_sl, OPT_INT) // in RoCE, this means PCP OPTION(ms_async_rdma_dscp, OPT_INT) // in RoCE, this means DSCP +// when there are enough accept failures, indicating there are unrecoverable failures, +// just do ceph_abort() . Here we make it configurable. 
+OPTION(ms_max_accept_failures, OPT_INT) + OPTION(ms_dpdk_port_id, OPT_INT) SAFE_OPTION(ms_dpdk_coremask, OPT_STR) // it is modified in unittest so that use SAFE_OPTION to declare OPTION(ms_dpdk_memory_channel, OPT_STR) diff --git a/src/common/options.cc b/src/common/options.cc index b0c7ccc252f33..ff3bb1a1be193 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -812,6 +812,11 @@ std::vector