Skip to content

Commit

Permalink
Merge pull request #54120 from dparmar18/wip-63269-pacific
Browse files Browse the repository at this point in the history
pacific: mds: report clients laggy due to laggy OSDs only after checking whether any OSD is laggy

Reviewed-by: Rishabh Dave <ridave@redhat.com>
  • Loading branch information
yuriw committed Nov 15, 2023
2 parents 2469222 + 1be4ff9 commit b6297d5
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 16 deletions.
37 changes: 23 additions & 14 deletions qa/tasks/cephfs/test_client_recovery.py
Expand Up @@ -13,6 +13,7 @@
import string
import os

from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
Expand Down Expand Up @@ -750,24 +751,27 @@ def test_client_eviction_if_config_is_set(self):
# it takes time to have laggy clients entries in cluster log,
# wait for 6 minutes to see if it is visible, finally restart
# the client
tries = 6
while True:
try:
with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
timeout=55):
# make sure clients weren't evicted
self.assert_session_count(2)
break
except AssertionError:
tries -= 1
if tries:
continue
raise
with contextutil.safe_while(sleep=5, tries=6) as proceed:
while proceed():
try:
with self.assert_cluster_log("1 client(s) laggy due to"
" laggy OSDs",
timeout=55):
# make sure clients weren't evicted
self.assert_session_count(2)
break
except (AssertionError, CommandFailedError) as e:
log.debug(f'{e}, retrying')

# clear lagginess, expect to get the warning cleared and make sure
# client gets evicted
self.clear_laggy_params(osd)
self.wait_for_health_clear(60)
self.assert_session_count(1)
finally:
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()
self.mount_a.create_destroy()
self.clear_laggy_params(osd)

def test_client_eviction_if_config_is_unset(self):
"""
Expand Down Expand Up @@ -799,6 +803,11 @@ def test_client_eviction_if_config_is_unset(self):

time.sleep(session_timeout)
self.assert_session_count(1)

# make sure warning wasn't seen in cluster log
with self.assert_cluster_log("laggy due to laggy OSDs",
timeout=120, present=False):
pass
finally:
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()
Expand Down
8 changes: 6 additions & 2 deletions src/mds/Beacon.cc
Expand Up @@ -481,13 +481,17 @@ void Beacon::notify_health(MDSRank const *mds)

// Report laggy client(s) due to laggy OSDs
{
bool defer_client_eviction =
g_conf().get_val<bool>("defer_client_eviction_on_laggy_osds")
&& mds->objecter->with_osdmap([](const OSDMap &map) {
return map.any_osd_laggy(); });
auto&& laggy_clients = mds->server->get_laggy_clients();
if (!laggy_clients.empty()) {
if (defer_client_eviction && !laggy_clients.empty()) {
std::vector<MDSHealthMetric> laggy_clients_metrics;
for (const auto& laggy_client: laggy_clients) {
CachedStackStringStream css;
*css << "Client " << laggy_client << " is laggy; not evicted"
<< " because some OSD(s) is/are laggy";
<< " because some OSD(s) is/are laggy";
MDSHealthMetric m(MDS_HEALTH_CLIENTS_LAGGY, HEALTH_WARN, css->strv());
laggy_clients_metrics.emplace_back(std::move(m));
}
Expand Down
2 changes: 2 additions & 0 deletions src/mds/Server.cc
Expand Up @@ -1243,6 +1243,8 @@ void Server::find_idle_sessions()
kill_session(session, NULL);
}
}
// clear as there's no use to keep the evicted clients in laggy_clients
clear_laggy_clients();
}

void Server::evict_cap_revoke_non_responders() {
Expand Down

0 comments on commit b6297d5

Please sign in to comment.