Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
MDSMonitor: handle damaged from standby-replay
This change does a few things:

- if a state transition is invalid or a beacon is garbage, the
  MDSMonitor now evicts the MDS instead of ignoring the problem.

- standby state validation is moved to prepare_beacon where eviction can
  happen.

- standby-replay may indicate the rank is damaged (failure to replay the
  journal).

- if the rank is damaged, both the rank holder and standby-replay daemon
  (if any) will be removed.

Fixes: https://tracker.ceph.com/issues/52565
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
  • Loading branch information
batrick committed Sep 9, 2021
1 parent 391f3c8 commit 20509bb
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 60 deletions.
3 changes: 3 additions & 0 deletions PendingReleaseNotes
Expand Up @@ -43,6 +43,9 @@

* OSD: Ceph now uses mclock_scheduler as its default osd_op_queue to provide QoS.

* CephFS: Failure to replay the journal by a standby-replay daemon will now
cause the rank to be marked damaged.

* RGW: S3 bucket notification events now contain an `eTag` key instead of `etag`,
and eventName values no longer carry the `s3:` prefix, fixing deviations from
the message format observed on AWS.
Expand Down
6 changes: 6 additions & 0 deletions src/mds/FSMap.h
Expand Up @@ -523,8 +523,14 @@ class FSMap {
// Report whether `filesystems` holds an entry keyed by the given cluster id.
bool filesystem_exists(fs_cluster_id_t fscid) const {
  return filesystems.find(fscid) != filesystems.end();
}
// Read-only lookup of a filesystem by cluster id.
// The entry is fetched with at(), so the id must already be present in
// `filesystems`; the stored pointer is re-typed to pointer-to-const on return.
Filesystem::const_ref get_filesystem(fs_cluster_id_t fscid) const {
  const auto& entry = filesystems.at(fscid);
  return std::const_pointer_cast<const Filesystem>(entry);
}
// Mutable lookup of a filesystem by cluster id.
// Uses at(), so the id must already be present in `filesystems`.
Filesystem::ref get_filesystem(fs_cluster_id_t fscid) {
  auto& entry = filesystems.at(fscid);
  return entry;
}
// Mutable lookup of the filesystem associated with an MDS daemon.
// The gid is first mapped through `mds_roles`, and the resulting key is then
// used to index `filesystems`. Both steps use at(), so the gid must be known
// to `mds_roles` and its mapped value must be a key of `filesystems`.
Filesystem::ref get_filesystem(mds_gid_t gid) {
  const auto& role_key = mds_roles.at(gid);
  return filesystems.at(role_key);
}
// Return the first entry of `filesystems` as a pointer-to-const.
// NOTE: begin() is dereferenced without an emptiness check, so the caller
// must ensure at least one filesystem exists before calling this overload.
Filesystem::const_ref get_filesystem(void) const {
  const auto first = filesystems.begin();
  return std::const_pointer_cast<const Filesystem>(first->second);
}
Filesystem::const_ref get_filesystem(std::string_view name) const;
// Read-only lookup of the filesystem associated with an MDS daemon.
// The gid is mapped through `mds_roles` and the result indexes `filesystems`;
// both lookups use at(), so the gid must be known to `mds_roles` and its
// mapped value must be a key of `filesystems`.
Filesystem::const_ref get_filesystem(mds_gid_t gid) const {
  const auto& role_key = mds_roles.at(gid);
  return std::const_pointer_cast<const Filesystem>(filesystems.at(role_key));
}

std::vector<Filesystem::const_ref> get_filesystems(void) const;

Expand Down
3 changes: 3 additions & 0 deletions src/mds/MDSMap.h
Expand Up @@ -446,6 +446,9 @@ class MDSMap {
return get_state_gid(it->second);
}

// Return (by value) the entry that `up` stores for rank r.
// Uses at(), so r must be a key of `up`.
auto get_gid(mds_rank_t r) const {
  const auto& holder = up.at(r);
  return holder;
}
// Return a const reference to the `mds_info` entry for the daemon that `up`
// records for rank m. Both lookups use at(), so m must be a key of `up` and
// the value found there must be a key of `mds_info`.
const auto& get_info(mds_rank_t m) const {
  const auto& holder = up.at(m);
  return mds_info.at(holder);
}
Expand Down
125 changes: 65 additions & 60 deletions src/mon/MDSMonitor.cc
Expand Up @@ -473,23 +473,6 @@ bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)

// is there a state change here?
if (info.state != state) {
// legal state change?
if ((info.state == MDSMap::STATE_STANDBY ||
info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
<< " -> " << ceph_mds_state_name(state) << ")" << dendl;
goto reply;
}

if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
&& info.rank != MDS_RANK_NONE)
{
dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
"held rank " << info.rank << " while requesting state "
<< ceph_mds_state_name(state) << dendl;
goto reply;
}

_note_beacon(m);
return false;
}
Expand Down Expand Up @@ -691,15 +674,7 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
* know which FS it was part of. Nor does this matter. Sending an empty
* MDSMap is sufficient for getting the MDS to respawn.
*/
wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
if (r >= 0) {
auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
mon.send_reply(op, m.detach());
} else {
dispatch(op); // try again
}
}));
return true;
goto null;
}

const auto& info = pending.get_info_gid(gid);
Expand All @@ -716,14 +691,27 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
return true;
}

if (info.state == MDSMap::STATE_STOPPING &&
// legal state change?
if ((info.state == MDSMap::STATE_STANDBY && state > 0) ||
(info.state == MDSMap::STATE_STANDBY_REPLAY && state > 0 && state != MDSMap::STATE_DAMAGED)) {
/* N.B.: standby-replay can indicate the rank is damaged due to failure to replay */
dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
<< " -> " << ceph_mds_state_name(state) << ")" << dendl;
goto evict;
} else if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
&& info.rank != MDS_RANK_NONE)
{
dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
"held rank " << info.rank << " while requesting state "
<< ceph_mds_state_name(state) << dendl;
goto evict;
} else if (info.state == MDSMap::STATE_STOPPING &&
state != MDSMap::STATE_STOPPING &&
state != MDSMap::STATE_STOPPED) {
// we can't transition to any other states from STOPPING
dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
<< dendl;
_note_beacon(m);
return true;
goto evict;
}

if (info.laggy()) {
Expand Down Expand Up @@ -770,8 +758,6 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
pending_daemon_health_rm.insert(erased_gid);
}
}


} else if (state == MDSMap::STATE_DAMAGED) {
if (!mon.osdmon()->is_writeable()) {
dout(1) << __func__ << ": DAMAGED from rank " << info.rank
Expand All @@ -780,55 +766,48 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
return false;
}

auto rank = info.rank;

// Record this MDS rank as damaged, so that other daemons
// won't try to run it.
dout(0) << __func__ << ": marking rank "
<< info.rank << " damaged" << dendl;
dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;

auto fs = pending.get_filesystem(gid);
auto rankgid = fs->mds_map.get_gid(rank);
auto rankinfo = pending.get_info_gid(rankgid);
auto followergid = fs->mds_map.get_standby_replay(rank);

ceph_assert(gid == rankgid || gid == followergid);

utime_t until = ceph_clock_now();
until += g_conf().get_val<double>("mon_mds_blocklist_interval");
const auto blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
request_proposal(mon.osdmon());
pending.damaged(gid, blocklist_epoch);
last_beacon.erase(gid);

// Respond to MDS, so that it knows it can continue to shut down
auto beacon = make_message<MMDSBeacon>(
mon.monmap->fsid, m->get_global_id(),
m->get_name(), pending.get_epoch(), state, seq,
CEPH_FEATURES_SUPPORTED_DEFAULT);
mon.send_reply(op, beacon.detach());
} else if (state == MDSMap::STATE_DNE) {
if (!mon.osdmon()->is_writeable()) {
dout(1) << __func__ << ": DNE from rank " << info.rank
<< " waiting for osdmon writeable to blocklist it" << dendl;
mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
if (followergid != MDS_GID_NONE) {
fail_mds_gid(pending, followergid);
last_beacon.erase(followergid);
}

fail_mds_gid(pending, gid);
ceph_assert(mon.osdmon()->is_writeable());
request_proposal(mon.osdmon());
pending.damaged(rankgid, blocklist_epoch);
last_beacon.erase(rankgid);

// Respond to MDS, so that it knows it can continue to shut down
auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq,
CEPH_FEATURES_SUPPORTED_DEFAULT);
mon.send_reply(op, beacon.detach());
/* MDS expects beacon reply back */
} else if (state == MDSMap::STATE_DNE) {
dout(1) << __func__ << ": DNE from " << info << dendl;
goto evict;
} else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
// Standby daemons should never modify their own
// state. Reject any attempts to do so.
derr << "standby " << gid << " attempted to change state to "
<< ceph_mds_state_name(state) << ", rejecting" << dendl;
return true;
goto evict;
} else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
!MDSMap::state_transition_valid(info.state, state)) {
// Validate state transitions for daemons that hold a rank
derr << "daemon " << gid << " (rank " << info.rank << ") "
<< "reported invalid state transition "
<< ceph_mds_state_name(info.state) << " -> "
<< ceph_mds_state_name(state) << dendl;
return true;
goto evict;
} else {
if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
const auto &fscid = pending.mds_roles.at(gid);
Expand Down Expand Up @@ -860,6 +839,32 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
}
}));

return true;

evict:
if (!mon.osdmon()->is_writeable()) {
dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
}

fail_mds_gid(pending, gid);
request_proposal(mon.osdmon());
dout(5) << __func__ << ": pending map now:" << dendl;
print_map(pending);

goto null;

null:
wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
if (r >= 0) {
auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
mon.send_reply(op, m.detach());
} else {
dispatch(op); // try again
}
}));

return true;
}

Expand Down

0 comments on commit 20509bb

Please sign in to comment.