Skip to content

Commit

Permalink
mgr: implement 'osd probably-ok-to-stop' command
Browse files Browse the repository at this point in the history
This is an approximate check whether stopping the set of OSDs
concurrently will result in PGs becoming unavailable (by dropping
below min_size).

Signed-off-by: Sage Weil <sage@redhat.com>
  • Loading branch information
liewegas committed Aug 10, 2017
1 parent 420038e commit 15d9094
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
72 changes: 72 additions & 0 deletions src/mgr/DaemonServer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,78 @@ bool DaemonServer::handle_command(MCommand *m)
ss << "osd." << osd << " is safe to destroy";
cmdctx->reply(0, ss);
return true;
} else if (prefix == "osd probably-ok-to-stop") {
  // Approximate check: would stopping all of the listed OSDs concurrently
  // push any PG below its pool's min_size (i.e. make it unavailable)?
  //
  // Replies:
  //   0        no affected PG is expected to drop below min_size
  //   -EAGAIN  PG information is incomplete (unknown PGs or missing stats)
  //   -EBUSY   at least one affected PG is inactive, already degraded, or
  //            would drop below min_size
  //   other    negative code returned by parse_osd_id_list() for a bad list
  vector<string> ids;
  cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids);
  set<int> osds;
  // Initialise explicitly rather than relying on the callback below to
  // assign it before it is read.
  int r = 0;
  cluster_state.with_osdmap([&](const OSDMap& osdmap) {
    r = osdmap.parse_osd_id_list(ids, &osds, &ss);
  });
  if (r < 0) {
    cmdctx->reply(r, ss);
    return true;
  }
  map<pg_t,int> pg_delta;  // pgid -> net acting set size change
  int dangerous_pgs = 0;
  cluster_state.with_pgmap([&](const PGMap& pg_map) {
    return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
      if (pg_map.num_pg_unknown > 0) {
        ss << pg_map.num_pg_unknown << " pgs have unknown state; "
           << "cannot draw any conclusions";
        r = -EAGAIN;
        return;
      }
      // Each PG listed under a to-be-stopped OSD loses one member.
      for (auto osd : osds) {
        auto p = pg_map.pg_by_osd.find(osd);
        if (p != pg_map.pg_by_osd.end()) {
          for (auto& pgid : p->second) {
            --pg_delta[pgid];
          }
        }
      }
      for (auto& p : pg_delta) {
        auto q = pg_map.pg_stat.find(p.first);
        if (q == pg_map.pg_stat.end()) {
          ss << "missing information about " << p.first << "; cannot draw"
             << " any conclusions";
          r = -EAGAIN;
          return;
        }
        if (!(q->second.state & PG_STATE_ACTIVE) ||
            (q->second.state & PG_STATE_DEGRADED)) {
          // we don't currently have a good way to tell *how* degraded
          // a degraded PG is, so we have to assume we cannot remove
          // any more replicas/shards.
          ++dangerous_pgs;
          // Keep scanning: returning here would abandon the remaining PGs
          // and leave dangerous_pgs under-counted in the reply.
          continue;
        }
        const pg_pool_t *pi = osdmap.get_pg_pool(p.first.pool());
        if (!pi) {
          // pool deleted?
          continue;
        }
        // Compare in signed arithmetic.  p.second is negative; mixing it
        // with the unsigned acting.size() would wrap to a huge value (and
        // look "safe") if the delta ever exceeded the acting set size.
        const int remaining =
          static_cast<int>(q->second.acting.size()) + p.second;
        if (remaining < static_cast<int>(pi->min_size)) {
          ++dangerous_pgs;
        }
      }
    });
  });
  if (r) {
    cmdctx->reply(r, ss);
    return true;
  }
  if (dangerous_pgs) {
    ss << dangerous_pgs
       << " PGs are already degraded or might become unavailable";
    cmdctx->reply(-EBUSY, ss);
    return true;
  }
  ss << "OSD(s) " << osds << " are *probably* ok to stop without reducing "
     << "availability, provided there are no other concurrent failures "
     << "or interventions. " << pg_delta.size() << " PGs are likely to be "
     << "degraded (but remain available) as a result.";
  cmdctx->reply(0, ss);
  return true;
} else if (prefix == "pg force-recovery" ||
prefix == "pg force-backfill" ||
prefix == "pg cancel-force-recovery" ||
Expand Down
3 changes: 3 additions & 0 deletions src/mgr/MgrCommands.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ COMMAND("osd test-reweight-by-pg " \
COMMAND("osd safe-to-destroy name=id,type=CephOsdName",
"check whether osd.<id> can be safely destroyed without risking data loss",
"osd", "r", "cli,rest")
// Registers the "osd probably-ok-to-stop" mgr command.  The handler reads
// the "ids" argument as a list of strings; n=N presumably marks it as a
// repeatable argument -- confirm against the command descriptor syntax.
// The check is approximate (hence "probably" in the name).
COMMAND("osd probably-ok-to-stop name=ids,type=CephString,n=N",
"check whether osd(s) can be safely stopped without reducing availability",
"osd", "r", "cli,rest")

COMMAND("osd scrub " \
"name=who,type=CephString", \
Expand Down

0 comments on commit 15d9094

Please sign in to comment.