Merge pull request #24086 from batrick/i35976
luminous: mds: configurable timeout for client eviction

Reviewed-by: Venky Shankar <vshankar@redhat.com>
yuriw committed Oct 3, 2018
2 parents cbfbe79 + 0014151 commit 336ae60
Showing 13 changed files with 136 additions and 19 deletions.
6 changes: 5 additions & 1 deletion doc/cephfs/eviction.rst
@@ -21,12 +21,16 @@ libcephfs.
Automatic client eviction
=========================

There are two situations in which a client may be evicted automatically:
There are three situations in which a client may be evicted automatically:

On an active MDS daemon, if a client has not communicated with the MDS for
over ``mds_session_autoclose`` seconds (300 seconds by default), then it
will be evicted automatically.

On an active MDS daemon, if a client has not responded to cap revoke messages
for over ``mds_cap_revoke_eviction_timeout`` seconds (a configuration option),
then it will be evicted automatically. This check is disabled by default.

During MDS startup (including on failover), the MDS passes through a
state called ``reconnect``. During this state, it waits for all the
clients to connect to the new MDS daemon. If any clients fail to do
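For reference, the new timeout can be changed on a running MDS through its admin socket, which is also how the test added below applies it (mds_asok(['config', 'set', ...])). The sketch below is illustrative only: the daemon name mds.a, the 300-second value, and the shape of the returned JSON are assumptions, not part of this change.

# Sketch: enable the cap-revoke eviction timeout on a live MDS via its admin
# socket. "ceph daemon <name> config set/get" are standard admin-socket
# commands; the daemon name and the 300 s value are assumptions.
import json
import subprocess

def set_cap_revoke_timeout(daemon="mds.a", seconds=300):
    subprocess.check_call([
        "ceph", "daemon", daemon, "config", "set",
        "mds_cap_revoke_eviction_timeout", str(seconds),
    ])
    out = subprocess.check_output([
        "ceph", "daemon", daemon, "config", "get",
        "mds_cap_revoke_eviction_timeout",
    ])
    return json.loads(out)  # JSON shape is an assumption of this sketch

Setting the value back to 0 restores the default behaviour, i.e. clients are never auto-evicted for unanswered cap revokes.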
1 change: 1 addition & 0 deletions qa/suites/fs/multiclient/tasks/cephfs_misc_tests.yaml
@@ -8,3 +8,4 @@ overrides:
log-whitelist:
- evicting unresponsive client
- POOL_APP_NOT_ENABLED
- has not responded to cap revoke by MDS for over
7 changes: 7 additions & 0 deletions qa/tasks/cephfs/filesystem.py
@@ -439,6 +439,10 @@ def deactivate(self, rank):
raise RuntimeError("cannot deactivate rank 0")
self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))

def set_var(self, var, *args):
a = map(str, args)
self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)

def set_max_mds(self, max_mds):
self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds)

@@ -558,6 +562,9 @@ def _df(self):
def get_mds_map(self):
return self.status().get_fsmap(self.id)['mdsmap']

def get_var(self, var):
return self.status().get_fsmap(self.id)['mdsmap'][var]

def add_data_pool(self, name):
self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.get_pgs_per_fs_pool().__str__())
self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name)
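A short usage sketch for the new set_var()/get_var() helpers, in the spirit of the existing set_max_mds() wrapper; fs is assumed to be an already-constructed Filesystem object from the qa harness, and the variable names are the ones this change and the new test actually use.

# Sketch only: `fs` is an existing tasks.cephfs.filesystem.Filesystem
# instance from a teuthology run.
def tune_fs(fs):
    # set_var() wraps "ceph fs set <name> <var> <value>", the same mon
    # command set_max_mds() issues for max_mds
    fs.set_var("max_mds", "2")
    # get_var() reads a field straight out of the filesystem's MDS map,
    # e.g. the session timeout the new test consults
    return fs.get_var("session_timeout")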
57 changes: 56 additions & 1 deletion qa/tasks/cephfs/test_misc.py
@@ -2,11 +2,13 @@
from unittest import SkipTest
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.orchestra.run import CommandFailedError
from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
import errno
import time
import json
import logging

log = logging.getLogger(__name__)

class TestMisc(CephFSTestCase):
CLIENTS_REQUIRED = 2
@@ -130,6 +132,59 @@ def test_evict_client(self):
ls_data = self.fs.mds_asok(['session', 'ls'])
self.assert_session_count(1, ls_data)

def test_cap_revoke_nonresponder(self):
"""
Check that a client is evicted if it has not responded to cap revoke
request for configured number of seconds.
"""
session_timeout = self.fs.get_var("session_timeout")
eviction_timeout = session_timeout / 2.0

self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
str(eviction_timeout)])

cap_holder = self.mount_a.open_background()

# Wait for the file to be visible from another client, indicating
# that mount_a has completed its network ops
self.mount_b.wait_for_visible()

# Simulate client death
self.mount_a.kill()

try:
# The waiter should get stuck waiting for the capability
# held on the MDS by the now-dead client A
cap_waiter = self.mount_b.write_background()

a = time.time()
time.sleep(eviction_timeout)
cap_waiter.wait()
b = time.time()
cap_waited = b - a
log.info("cap_waiter waited {0}s".format(cap_waited))

# check if the cap is transferred before session timeout kicked in.
# this is a good enough check to ensure that the client got evicted
# by the cap auto evicter rather than transitioning to stale state
# and then getting evicted.
self.assertLess(cap_waited, session_timeout,
"Capability handover took {0}, expected less than {1}".format(
cap_waited, session_timeout
))

cap_holder.stdin.close()
try:
cap_holder.wait()
except (CommandFailedError, ConnectionLostError):
# We killed it (and possibly its node), so it raises an error
pass
finally:
self.mount_a.kill_cleanup()

self.mount_a.mount()
self.mount_a.wait_until_mounted()

def test_filtered_df(self):
pool_name = self.fs.get_data_pool_name()
raw_df = self.fs.get_pool_df(pool_name)
4 changes: 4 additions & 0 deletions src/common/options.cc
@@ -6500,6 +6500,10 @@ std::vector<Option> get_mds_options() {
Option("mds_request_load_average_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60)
.set_description("rate of decay in seconds for calculating request load average"),

Option("mds_cap_revoke_eviction_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description("number of seconds after which clients which have not responded to cap revoke messages by the MDS are evicted."),
});
}

3 changes: 2 additions & 1 deletion src/mds/Beacon.cc
@@ -333,7 +333,8 @@ void Beacon::notify_health(MDSRank const *mds)
// CLIENT_CAPS messages.
{
std::list<client_t> late_clients;
mds->locker->get_late_revoking_clients(&late_clients);
mds->locker->get_late_revoking_clients(&late_clients,
mds->mdsmap->get_session_timeout());
std::list<MDSHealthMetric> late_cap_metrics;

for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
21 changes: 9 additions & 12 deletions src/mds/Locker.cc
@@ -3536,7 +3536,8 @@ void Locker::remove_client_cap(CInode *in, client_t client)
* Return true if any currently revoking caps exceed the
* given timeout threshold.
*/
bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking) const
bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking,
double timeout) const
{
xlist<Capability*>::const_iterator p = revoking.begin();
if (p.end()) {
@@ -3545,30 +3546,26 @@ bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking) const
} else {
utime_t now = ceph_clock_now();
utime_t age = now - (*p)->get_last_revoke_stamp();
if (age <= g_conf->mds_session_timeout) {
if (age <= timeout) {
return false;
} else {
return true;
}
}
}


void Locker::get_late_revoking_clients(std::list<client_t> *result) const
void Locker::get_late_revoking_clients(std::list<client_t> *result,
double timeout) const
{
if (!any_late_revoking_caps(revoking_caps)) {
if (!any_late_revoking_caps(revoking_caps, timeout)) {
// Fast path: no misbehaving clients, execute in O(1)
return;
}

// Slow path: execute in O(N_clients)
std::map<client_t, xlist<Capability*> >::const_iterator client_rc_iter;
for (client_rc_iter = revoking_caps_by_client.begin();
client_rc_iter != revoking_caps_by_client.end(); ++client_rc_iter) {
xlist<Capability*> const &client_rc = client_rc_iter->second;
bool any_late = any_late_revoking_caps(client_rc);
if (any_late) {
result->push_back(client_rc_iter->first);
for (auto &p : revoking_caps_by_client) {
if (any_late_revoking_caps(p.second, timeout)) {
result->push_back(p.first);
}
}
}
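A simplified model of the check above may help when reading the C++: any_late_revoking_caps() only looks at the front of the xlist, which relies on each revoking list being kept roughly in revoke-timestamp order, oldest first. The Python below mirrors that fast-path/slow-path split; the type and function names are illustrative, not Ceph's.

# Simplified model of Locker::any_late_revoking_caps and
# get_late_revoking_clients. It assumes each revoking list is ordered
# oldest revoke first, which is what makes the front-element check an
# O(1) fast path.
import time
from collections import namedtuple

RevokingCap = namedtuple("RevokingCap", ["last_revoke_stamp"])

def any_late_revoking_caps(revoking, timeout, now=None):
    if not revoking:
        return False
    age = (time.time() if now is None else now) - revoking[0].last_revoke_stamp
    return age > timeout

def get_late_revoking_clients(revoking_caps, revoking_caps_by_client, timeout):
    # Fast path: if the globally oldest revoking cap is not late, nobody is.
    if not any_late_revoking_caps(revoking_caps, timeout):
        return []
    # Slow path: walk the per-client lists, as the new range-for loop does.
    return [client for client, caps in revoking_caps_by_client.items()
            if any_late_revoking_caps(caps, timeout)]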
6 changes: 4 additions & 2 deletions src/mds/Locker.h
@@ -186,8 +186,10 @@ class Locker {

void remove_client_cap(CInode *in, client_t client);

void get_late_revoking_clients(std::list<client_t> *result) const;
bool any_late_revoking_caps(xlist<Capability*> const &revoking) const;
void get_late_revoking_clients(std::list<client_t> *result, double timeout) const;

private:
bool any_late_revoking_caps(xlist<Capability*> const &revoking, double timeout) const;

protected:
bool _need_flush_mdlog(CInode *in, int wanted_caps);
1 change: 1 addition & 0 deletions src/mds/MDSDaemon.cc
@@ -380,6 +380,7 @@ const char** MDSDaemon::get_tracked_conf_keys() const
"host",
"fsid",
"mds_request_load_average_decay_rate",
"mds_cap_revoke_eviction_timeout",
NULL
};
return KEYS;
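Registering the option here is what makes it changeable at runtime: the config machinery notifies observers whose tracked keys changed, and MDSRank forwards that into the new Server::handle_conf_change() below. A minimal model of that tracked-keys pattern, with made-up class names, is:

# Minimal model of the tracked-keys pattern: an observer lists the option
# names it cares about; when one of them is set, the config store hands the
# observer the set of changed keys. All names here are illustrative.
class ConfigStore:
    def __init__(self):
        self.values = {}
        self.observers = []  # list of (tracked_keys, callback)

    def add_observer(self, tracked_keys, callback):
        self.observers.append((set(tracked_keys), callback))

    def set(self, key, value):
        self.values[key] = value
        for tracked, callback in self.observers:
            if key in tracked:
                callback(self, {key})

class MiniServer:
    cap_revoke_eviction_timeout = 0.0

    def handle_conf_change(self, conf, changed):
        if "mds_cap_revoke_eviction_timeout" in changed:
            self.cap_revoke_eviction_timeout = float(
                conf.values["mds_cap_revoke_eviction_timeout"])

store = ConfigStore()
server = MiniServer()
store.add_observer(["mds_cap_revoke_eviction_timeout"],
                   lambda conf, changed: server.handle_conf_change(conf, changed))
store.set("mds_cap_revoke_eviction_timeout", "300")
assert server.cap_revoke_eviction_timeout == 300.0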
2 changes: 1 addition & 1 deletion src/mds/MDSRank.cc
@@ -31,7 +31,6 @@
#include "MDBalancer.h"
#include "Migrator.h"
#include "Locker.h"
#include "Server.h"
#include "InoTable.h"
#include "mon/MonClient.h"
#include "common/HeartbeatMap.h"
@@ -276,6 +275,7 @@ void MDSRankDispatcher::tick()
// ...
if (is_clientreplay() || is_active() || is_stopping()) {
server->find_idle_sessions();
server->evict_cap_revoke_non_responders();
locker->tick();
}

3 changes: 2 additions & 1 deletion src/mds/MDSRank.h
@@ -31,6 +31,7 @@
#include "MDCache.h"
#include "MDLog.h"
#include "PurgeQueue.h"
#include "Server.h"
#include "osdc/Journaler.h"

// Full .h import instead of forward declaration for PerfCounter, for the
@@ -98,7 +99,6 @@ namespace ceph {
struct heartbeat_handle_d;
}

class Server;
class Locker;
class MDCache;
class MDLog;
@@ -221,6 +221,7 @@ class MDSRank {
{
mdcache->handle_conf_change(conf, changed, *mdsmap);
sessionmap.handle_conf_change(conf, changed);
server->handle_conf_change(conf, changed);
purge_queue.handle_conf_change(conf, changed, *mdsmap);
}

37 changes: 37 additions & 0 deletions src/mds/Server.cc
@@ -178,6 +178,9 @@ void Server::create_logger()
plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
"Request type rename snapshot latency");

plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
"Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);

plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
"Client requests dispatched");
@@ -792,6 +795,40 @@ void Server::find_idle_sessions()
}
}

void Server::evict_cap_revoke_non_responders() {
if (!cap_revoke_eviction_timeout) {
return;
}

std::list<client_t> to_evict;
mds->locker->get_late_revoking_clients(&to_evict, cap_revoke_eviction_timeout);

for (auto const &client: to_evict) {
mds->clog->warn() << "client id " << client << " has not responded to"
<< " cap revoke by MDS for over " << cap_revoke_eviction_timeout
<< " seconds, evicting";
dout(1) << __func__ << ": evicting cap revoke non-responder client id "
<< client << dendl;

std::stringstream ss;
bool evicted = mds->evict_client(client.v, false,
g_conf->mds_session_blacklist_on_evict,
ss, nullptr);
if (evicted && logger) {
logger->inc(l_mdss_cap_revoke_eviction);
}
}
}

void Server::handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed) {
if (changed.count("mds_cap_revoke_eviction_timeout")) {
cap_revoke_eviction_timeout = conf->get_val<double>("mds_cap_revoke_eviction_timeout");
dout(20) << __func__ << " cap revoke eviction timeout changed to "
<< cap_revoke_eviction_timeout << dendl;
}
}

/*
* XXX bump in the interface here, not using an MDSInternalContextBase here
* because all the callers right now happen to use a SaferCond
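Read as plain control flow, the new per-tick hook does three things: bail out when the timeout is 0 (the default, so the feature stays off), log and evict each late client, and bump the new cap_revoke_eviction perf counter for every successful eviction. The sketch below restates that flow; locker and evict_client are stand-ins, not the real MDS interfaces.

# Control-flow sketch of Server::evict_cap_revoke_non_responders().
# `locker.get_late_revoking_clients(timeout)` and `evict_client()` are
# stand-ins for the MDS internals shown above.
import logging

log = logging.getLogger("mds.server")

def evict_cap_revoke_non_responders(locker, evict_client, timeout,
                                    blacklist_on_evict=True):
    if not timeout:  # 0 (the default) disables the check entirely
        return 0
    evicted = 0
    for client in locker.get_late_revoking_clients(timeout):
        log.warning("client id %s has not responded to cap revoke by MDS "
                    "for over %s seconds, evicting", client, timeout)
        if evict_client(client, blacklist_on_evict):
            evicted += 1  # the C++ bumps l_mdss_cap_revoke_eviction here
    return evicted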
7 changes: 7 additions & 0 deletions src/mds/Server.h
@@ -66,6 +66,7 @@ enum {
l_mdss_req_setxattr_latency,
l_mdss_req_symlink_latency,
l_mdss_req_unlink_latency,
l_mdss_cap_revoke_eviction,
l_mdss_last,
};

@@ -89,6 +90,8 @@ class Server {
bool reconnect_evicting; // true if I am waiting for evictions to complete
// before proceeding to reconnect_gather_finish

double cap_revoke_eviction_timeout = 0;

friend class MDSContinuation;
friend class ServerContext;
friend class ServerLogContext;
@@ -311,6 +314,10 @@ class Server {
void _rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn, version_t srcdnpv,
CDentry *destdn, CDentry *staydn, bool finish_mdr);

void evict_cap_revoke_non_responders();
void handle_conf_change(const struct md_config_t *,
const std::set <std::string> &changed);

private:
void reply_client_request(MDRequestRef& mdr, MClientReply *reply);
};
