Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mon/PGMap: call blocked requests ERR not WARN #15501

Merged
merged 1 commit into from Jun 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions PendingReleaseNotes
Expand Up @@ -137,3 +137,13 @@
to zero will effectively disable the health check.
* The "ceph mds tell ..." command has been removed. It is superseded
by "ceph tell mds.<id> ..."

12.1.0
------

* The ``mon_osd_max_op_age`` option has been renamed to
``mon_osd_warn_op_age`` (default: 32 seconds), to indicate we
generate a warning at this age. There is also a new
``mon_osd_err_op_age_ratio`` that is expressed as a multiple of
``mon_osd_warn_op_age`` (default: 128, for roughly 60 minutes) to
control when an error is generated.
3 changes: 2 additions & 1 deletion src/common/config_opts.h
Expand Up @@ -288,7 +288,8 @@ OPTION(mon_osd_down_out_interval, OPT_INT, 600) // seconds
OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out
OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .75) // min osds required to be in to mark things out
OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get concerned (make it a power of 2)
OPTION(mon_osd_warn_op_age, OPT_DOUBLE, 32) // max op age before we generate a warning (make it a power of 2)
OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE, 128) // when to generate an error, as multiple of mon_osd_warn_op_age
OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap
OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap
Expand Down
76 changes: 53 additions & 23 deletions src/mon/PGMap.cc
Expand Up @@ -2431,31 +2431,39 @@ static void note_stuck_detail(
}
}

// Tally blocked ops from the power-of-two op-age histogram `h`.
//
// Buckets are visited from the highest index downwards; the walk stops at the
// first bucket whose bound `ub` falls below mon_osd_warn_op_age.  Every
// non-empty bucket visited is counted as an error when `ub` exceeds
// mon_osd_warn_op_age * mon_osd_err_op_age_ratio, otherwise as a warning.
// When `detail` is non-null, one "<n> ops are blocked > <ub> sec<suffix>"
// entry per non-empty bucket is appended to it.
//
// Returns (warning count, error count); (0, 0) when the histogram is empty.
// `summary` is accepted but not written to in the lines shown here.
//
// NOTE(review): this listing is a flattened diff.  Pre-change lines (the
// `static int` signature, `return 0`, `sum`, the mon_osd_max_op_age test, the
// HEALTH_WARN-only push_back, `return sum`) appear directly next to the lines
// that replaced them.
static int _warn_slow_request_histogram(
static pair<int,int> _warn_slow_request_histogram(
CephContext *cct,
const pow2_hist_t& h,
string suffix,
list<pair<health_status_t,string> >& summary,
list<pair<health_status_t,string> > *detail)
{
if (h.h.empty())
return 0;
return make_pair(0, 0);

unsigned sum = 0;
unsigned warn = 0, error = 0;
// age above which a bucket is reported as an error rather than a warning
float err_age =
cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
// highest bucket first; the loop ends at i > 0, so bucket 0 is never examined
for (unsigned i = h.h.size() - 1; i > 0; --i) {
// bucket bound computed as 2^i / 1000 -- presumably milliseconds converted
// to seconds; confirm against pow2_hist_t.
// NOTE(review): `1 << i` is an int shift, which is undefined for i >= 31.
float ub = (float)(1 << i) / 1000.0;
if (ub < cct->_conf->mon_osd_max_op_age)
if (ub < cct->_conf->mon_osd_warn_op_age)
break;
if (h.h[i]) {
// severity defaults to warning and is raised when the bound exceeds err_age
auto sev = HEALTH_WARN;
if (ub > err_age) {
sev = HEALTH_ERR;
error += h.h[i];
} else {
warn += h.h[i];
}
if (detail) {
ostringstream ss;
ss << h.h[i] << " ops are blocked > " << ub << " sec" << suffix;
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
detail->push_back(make_pair(sev, ss.str()));
}
sum += h.h[i];
}
}
return sum;
// unsigned tallies are converted to the pair<int,int> return type here
return make_pair(warn, error);
}

namespace {
Expand Down Expand Up @@ -2708,33 +2716,55 @@ void PGMap::get_health(
}

// slow requests
if (cct->_conf->mon_osd_max_op_age > 0 &&
osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_max_op_age) {
unsigned sum = _warn_slow_request_histogram(
if (cct->_conf->mon_osd_warn_op_age > 0 &&
osd_sum.op_queue_age_hist.upper_bound() > cct->_conf->mon_osd_warn_op_age) {
auto sum = _warn_slow_request_histogram(
cct, osd_sum.op_queue_age_hist, "", summary, NULL);
if (sum > 0) {
ostringstream ss;
ss << sum << " requests are blocked > " << cct->_conf->mon_osd_max_op_age
<< " sec";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (sum.first > 0 || sum.second > 0) {
if (sum.first > 0) {
ostringstream ss;
ss << sum.first << " requests are blocked > "
<< cct->_conf->mon_osd_warn_op_age
<< " sec";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
}
if (sum.second > 0) {
ostringstream ss;
ss << sum.first << " requests are blocked > "
<< (cct->_conf->mon_osd_warn_op_age *
cct->_conf->mon_osd_err_op_age_ratio)
<< " sec";
summary.push_back(make_pair(HEALTH_ERR, ss.str()));
}

if (detail) {
unsigned num_slow_osds = 0;
unsigned num_warn = 0, num_err = 0;
// do per-osd warnings
for (auto p = osd_stat.begin();
p != osd_stat.end();
++p) {
if (_warn_slow_request_histogram(
auto sum = _warn_slow_request_histogram(
cct,
p->second.op_queue_age_hist,
string(" on osd.") + stringify(p->first),
summary, detail))
++num_slow_osds;
summary, detail);
if (sum.second)
++num_err;
else if (sum.first)
++num_warn;
}
if (num_err) {
ostringstream ss2;
ss2 << num_err << " osds have very slow requests";
summary.push_back(make_pair(HEALTH_ERR, ss2.str()));
detail->push_back(make_pair(HEALTH_ERR, ss2.str()));
}
if (num_warn) {
ostringstream ss2;
ss2 << num_err << " osds have slow requests";
summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
}
ostringstream ss2;
ss2 << num_slow_osds << " osds have slow requests";
summary.push_back(make_pair(HEALTH_WARN, ss2.str()));
detail->push_back(make_pair(HEALTH_WARN, ss2.str()));
}
}
}
Expand Down