Skip to content

Commit

Permalink
Merge pull request #12074 from batrick/i17604
Browse files Browse the repository at this point in the history
mds: warn if insufficient standbys exist

Reviewed-by: John Spray <john.spray@redhat.com>
  • Loading branch information
John Spray committed Mar 28, 2017
2 parents 4c8ec8a + 7278543 commit d495900
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 10 deletions.
9 changes: 9 additions & 0 deletions doc/cephfs/health-messages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ they are supposed to send beacon messages every ``mds_beacon_interval``
(default 4s). The daemons may have crashed. The Ceph monitor will
automatically replace laggy daemons with standbys if any are available.

Message: insufficient standby daemons available
Description: One or more file systems are configured to have a certain number
of standby daemons available (including daemons in standby-replay) but the
cluster does not have enough standby daemons. The standby daemons not in replay
count towards any file system (i.e. they may overlap). This warning can be
configured by setting ``ceph fs set <fs> standby_count_wanted <count>``. Use
zero for ``count`` to disable.


Daemon-reported health checks
=============================

Expand Down
14 changes: 14 additions & 0 deletions doc/cephfs/standby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,20 @@ If an MDS daemon stops communicating with the monitor, the monitor will
wait ``mds_beacon_grace`` seconds (default 15 seconds) before marking
the daemon as *laggy*.

Each file system may specify a number of standby daemons to be considered
healthy. This number includes daemons in standby-replay waiting for a rank to
fail (remember that a standby-replay daemon will not be assigned to take over a
failure for another rank or a failure in another CephFS file system). The
pool of standby daemons not in replay counts towards any file system count.
Each file system may set the number of standby daemons wanted using:

::

ceph fs set <fs name> standby_count_wanted <count>

Setting ``count`` to 0 will disable the health check.


Configuring standby daemons
---------------------------

Expand Down
23 changes: 21 additions & 2 deletions src/mds/FSMap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ void FSMap::reset_filesystem(fs_cluster_id_t fscid)
new_fs->mds_map.modified = ceph_clock_now();
new_fs->mds_map.session_timeout = g_conf->mds_session_timeout;
new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
new_fs->mds_map.enabled = true;

// Persist the new FSMap
Expand All @@ -300,13 +301,31 @@ void FSMap::reset_filesystem(fs_cluster_id_t fscid)
/*
 * Gather health messages from every file system's MDSMap and, in addition,
 * append one HEALTH_WARN entry to `summary` when at least one file system
 * reports that it wants more standby daemons than are available.
 *
 * `detail` is only forwarded to MDSMap::get_health(); the standby warning
 * itself is added to `summary` alone.
 */
void FSMap::get_health(list<pair<health_status_t,string> >& summary,
list<pair<health_status_t,string> > *detail) const
{
for (auto i : filesystems) {
auto fs = i.second;
// Largest standby shortfall reported by any single file system so far.
mds_rank_t standby_count_wanted = 0;
for (const auto &i : filesystems) {
const auto &fs = i.second;

// TODO: move get_health up into here so that we can qualify
// all the messages with what filesystem they're talking about
fs->mds_map.get_health(summary, detail);

// get_standby_count_wanted() yields how many more standbys this file system
// wants given the shared standby pool (0 when satisfied); keep the maximum.
standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
}

// A non-zero value means some file system is short of standbys: warn once,
// reporting the size of the shared standby pool and the largest shortfall.
if (standby_count_wanted) {
std::ostringstream oss;
oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
summary.push_back(make_pair(HEALTH_WARN, oss.str()));
}
}

bool FSMap::check_health(void)
{
bool changed = false;
for (auto &i : filesystems) {
changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
}
return changed;
}

void FSMap::encode(bufferlist& bl, uint64_t features) const
Expand Down
2 changes: 2 additions & 0 deletions src/mds/FSMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,8 @@ class FSMap {
void get_health(list<pair<health_status_t,std::string> >& summary,
list<pair<health_status_t,std::string> > *detail) const;

bool check_health(void);

/**
* Assert that the FSMap, Filesystem, MDSMap, mds_info_t relations are
* all self-consistent.
Expand Down
41 changes: 34 additions & 7 deletions src/mds/MDSMap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ void MDSMap::dump(Formatter *f) const
f->dump_bool("enabled", enabled);
f->dump_string("fs_name", fs_name);
f->dump_string("balancer", balancer);
f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
}

void MDSMap::generate_test_instances(list<MDSMap*>& ls)
Expand Down Expand Up @@ -228,6 +229,7 @@ void MDSMap::print(ostream& out) const
out << "metadata_pool\t" << metadata_pool << "\n";
out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
out << "balancer\t" << balancer << "\n";
out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";

multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
for (const auto &p : mds_info) {
Expand Down Expand Up @@ -374,14 +376,12 @@ void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
}
}

map<mds_rank_t, mds_gid_t>::const_iterator u = up.begin();
map<mds_rank_t, mds_gid_t>::const_iterator u_end = up.end();
map<mds_gid_t, mds_info_t>::const_iterator m_end = mds_info.end();
set<string> laggy;
for (; u != u_end; ++u) {
map<mds_gid_t, mds_info_t>::const_iterator m = mds_info.find(u->second);
for (const auto &u : up) {
map<mds_gid_t, mds_info_t>::const_iterator m = mds_info.find(u.second);
if (m == m_end) {
std::cerr << "Up rank " << u->first << " GID " << u->second << " not found!" << std::endl;
std::cerr << "Up rank " << u.first << " GID " << u.second << " not found!" << std::endl;
}
assert(m != m_end);
const mds_info_t &mds_info(m->second);
Expand Down Expand Up @@ -558,7 +558,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(cas_pool, bl);

// kclient ignores everything from here
__u16 ev = 11;
__u16 ev = 12;
::encode(ev, bl);
::encode(compat, bl);
::encode(metadata_pool, bl);
Expand All @@ -578,6 +578,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(fs_name, bl);
::encode(damaged, bl);
::encode(balancer, bl);
::encode(standby_count_wanted, bl);
ENCODE_FINISH(bl);
}

Expand Down Expand Up @@ -684,8 +685,13 @@ void MDSMap::decode(bufferlist::iterator& p)
}

if (ev >= 11) {
::decode(balancer, p);
::decode(balancer, p);
}

if (ev >= 12) {
::decode(standby_count_wanted, p);
}

DECODE_FINISH(p);
}

Expand Down Expand Up @@ -760,3 +766,24 @@ bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next)

return state_valid;
}

/*
 * Give standby_count_wanted its effective default the first time the file
 * system is seen with both an active MDS and a standby available.
 *
 * standby_daemon_count is the number of standby daemons supplied by the
 * caller; daemons in standby-replay for this map are added on top of it.
 *
 * Returns true if the map was modified (standby_count_wanted set to 1),
 * false otherwise.
 */
bool MDSMap::check_health(mds_rank_t standby_daemon_count)
{
  std::set<mds_rank_t> replay_ranks;
  get_standby_replay_mds_set(replay_ranks);
  std::set<mds_rank_t> active_ranks;
  get_active_mds_set(active_ranks);
  const mds_rank_t available =
    static_cast<mds_rank_t>(replay_ranks.size()) + standby_daemon_count;

  /* -1 marks standby_count_wanted as unset (the default). Once standbys are
   * available or replaying we switch it to 1; this happens during health
   * checks by the mons. While the FS is being created there are no actives
   * yet, and in that case the default must be left untouched.
   */
  const bool still_unset = (standby_count_wanted == -1);
  const bool has_active = !active_ranks.empty();
  const bool has_standby = (available > 0);
  if (!(still_unset && has_active && has_standby)) {
    return false;
  }

  set_standby_count_wanted(1);
  return true;
}
16 changes: 16 additions & 0 deletions src/mds/MDSMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ class MDSMap {
*/

mds_rank_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. */
mds_rank_t standby_count_wanted;
string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */

std::set<mds_rank_t> in; // currently defined cluster
Expand Down Expand Up @@ -228,6 +229,7 @@ class MDSMap {
cas_pool(-1),
metadata_pool(-1),
max_mds(0),
standby_count_wanted(-1),
ever_allowed_features(0),
explicitly_allowed_features(0),
inline_data_enabled(false),
Expand Down Expand Up @@ -290,6 +292,17 @@ class MDSMap {
mds_rank_t get_max_mds() const { return max_mds; }
void set_max_mds(mds_rank_t m) { max_mds = m; }

/**
 * Compute how many more standby daemons this file system wants.
 *
 * @param standby_daemon_count number of standby daemons supplied by the
 *        caller; must be non-negative (asserted).
 * @return the shortfall between the wanted count and the standbys available
 *         (caller-supplied standbys plus this map's standby-replay daemons),
 *         or 0 when the wanted count is already met.
 */
mds_rank_t get_standby_count_wanted(mds_rank_t standby_daemon_count) const {
  assert(standby_daemon_count >= 0);

  std::set<mds_rank_t> replay_ranks;
  get_standby_replay_mds_set(replay_ranks);
  const mds_rank_t available =
    static_cast<mds_rank_t>(replay_ranks.size()) + standby_daemon_count;

  // A negative standby_count_wanted (the unset default of -1) counts as zero.
  const mds_rank_t target = standby_count_wanted < 0 ? 0 : standby_count_wanted;
  if (target <= available) {
    return 0;
  }
  return target - available;
}
// Store the number of standby daemons this file system wants. No validation
// is done here; the "fs set ... standby_count_wanted" command handler rejects
// negative values before calling this.
void set_standby_count_wanted(mds_rank_t n) { standby_count_wanted = n; }
// Switch an unset (-1) standby_count_wanted to 1 once the map has an active
// MDS and at least one standby is available; returns true if the map changed.
bool check_health(mds_rank_t standby_daemon_count);

const std::string get_balancer() const { return balancer; }
void set_balancer(std::string val) { balancer.assign(val); }

Expand Down Expand Up @@ -370,6 +383,9 @@ class MDSMap {
void get_active_mds_set(std::set<mds_rank_t>& s) const {
get_mds_set(s, MDSMap::STATE_ACTIVE);
}
// Collect into `s` the entries get_mds_set() selects for
// MDSMap::STATE_STANDBY_REPLAY (same pattern as get_active_mds_set above).
void get_standby_replay_mds_set(std::set<mds_rank_t>& s) const {
get_mds_set(s, MDSMap::STATE_STANDBY_REPLAY);
}
void get_failed_mds_set(std::set<mds_rank_t>& s) const {
s = failed;
}
Expand Down
15 changes: 15 additions & 0 deletions src/mon/FSCommands.cc
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,21 @@ class SetHandler : public FileSystemCommandHandler
});

ss << "marked " << (is_down ? "down" : "up");
} else if (var == "standby_count_wanted") {
if (interr.length()) {
ss << var << " requires an integer value";
return -EINVAL;
}
if (n < 0) {
ss << var << " must be non-negative";
return -ERANGE;
}
fsmap.modify_filesystem(
fs->fscid,
[n](std::shared_ptr<Filesystem> fs)
{
fs->mds_map.set_standby_count_wanted(n);
});
} else {
ss << "unknown variable " << var;
return -EINVAL;
Expand Down
2 changes: 2 additions & 0 deletions src/mon/MDSMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2065,6 +2065,8 @@ void MDSMonitor::tick()

if (!mon->is_leader()) return;

do_propose |= pending_fsmap.check_health();

// expand mds cluster (add new nodes to @in)?
for (auto i : pending_fsmap.filesystems) {
do_propose |= maybe_expand_cluster(i.second);
Expand Down
3 changes: 2 additions & 1 deletion src/mon/MonCommands.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,8 @@ COMMAND("fs get name=fs_name,type=CephString", \
COMMAND("fs set " \
"name=fs_name,type=CephString " \
"name=var,type=CephChoices,strings=max_mds|max_file_size"
"|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer " \
"|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer" \
"|standby_count_wanted " \
"name=val,type=CephString " \
"name=confirm,type=CephString,req=false", \
"set mds parameter <var> to <val>", "mds", "rw", "cli,rest")
Expand Down

0 comments on commit d495900

Please sign in to comment.