diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py index 61d23d596d0a6..082e7fa84d2b8 100644 --- a/qa/tasks/cephfs/test_client_recovery.py +++ b/qa/tasks/cephfs/test_client_recovery.py @@ -13,6 +13,7 @@ import string import os +from teuthology import contextutil from teuthology.orchestra import run from teuthology.orchestra.run import CommandFailedError from tasks.cephfs.fuse_mount import FuseMount @@ -750,24 +751,27 @@ def test_client_eviction_if_config_is_set(self): # it takes time to have laggy clients entries in cluster log, # wait for 6 minutes to see if it is visible, finally restart # the client - tries = 6 - while True: - try: - with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs", - timeout=55): - # make sure clients weren't evicted - self.assert_session_count(2) - break - except AssertionError: - tries -= 1 - if tries: - continue - raise + with contextutil.safe_while(sleep=5, tries=6) as proceed: + while proceed(): + try: + with self.assert_cluster_log("1 client(s) laggy due to" + " laggy OSDs", + timeout=55): + # make sure clients weren't evicted + self.assert_session_count(2) + break + except (AssertionError, CommandFailedError) as e: + log.debug(f'{e}, retrying') + + # clear lagginess, expect to get the warning cleared and make sure + # client gets evicted + self.clear_laggy_params(osd) + self.wait_for_health_clear(60) + self.assert_session_count(1) finally: self.mount_a.kill_cleanup() self.mount_a.mount_wait() self.mount_a.create_destroy() - self.clear_laggy_params(osd) def test_client_eviction_if_config_is_unset(self): """ @@ -799,6 +803,11 @@ def test_client_eviction_if_config_is_unset(self): time.sleep(session_timeout) self.assert_session_count(1) + + # make sure warning wasn't seen in cluster log + with self.assert_cluster_log("laggy due to laggy OSDs", + timeout=120, present=False): + pass finally: self.mount_a.kill_cleanup() self.mount_a.mount_wait() diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 76e71f2fe6e55..ffd3056b697cf 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -481,13 +481,17 @@ void Beacon::notify_health(MDSRank const *mds) // Report laggy client(s) due to laggy OSDs { + bool defer_client_eviction = + g_conf().get_val("defer_client_eviction_on_laggy_osds") + && mds->objecter->with_osdmap([](const OSDMap &map) { + return map.any_osd_laggy(); }); auto&& laggy_clients = mds->server->get_laggy_clients(); - if (!laggy_clients.empty()) { + if (defer_client_eviction && !laggy_clients.empty()) { std::vector laggy_clients_metrics; for (const auto& laggy_client: laggy_clients) { CachedStackStringStream css; *css << "Client " << laggy_client << " is laggy; not evicted" - << " because some OSD(s) is/are laggy"; + << " because some OSD(s) is/are laggy"; MDSHealthMetric m(MDS_HEALTH_CLIENTS_LAGGY, HEALTH_WARN, css->strv()); laggy_clients_metrics.emplace_back(std::move(m)); } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index b778130ee5f51..554106e29c042 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1243,6 +1243,8 @@ void Server::find_idle_sessions() kill_session(session, NULL); } } + // clear as there's no use to keep the evicted clients in laggy_clients + clear_laggy_clients(); } void Server::evict_cap_revoke_non_responders() {