Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osd: randomize deep scrubbing #6550

Merged
Merged 2 commits on Nov 20, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions src/common/config_opts.h
Expand Up @@ -684,6 +684,7 @@ OPTION(osd_scrub_sleep, OPT_FLOAT, 0) // sleep between [deep]scrub ops
OPTION(osd_scrub_auto_repair, OPT_BOOL, false) // whether auto-repair inconsistencies upon deep-scrubbing
OPTION(osd_scrub_auto_repair_num_errors, OPT_U32, 5) // only auto-repair when number of errors is below this threshold
OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT, 0.15) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)
OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT, 2*60*60) // objects must be this old (seconds) before we update the whole-object digest on scrub
OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
Expand Down
43 changes: 34 additions & 9 deletions src/osd/OSD.cc
Expand Up @@ -1809,6 +1809,15 @@ int OSD::init()

dout(2) << "boot" << dendl;

// initialize the daily loadavg with current 15min loadavg
double loadavgs[3];
if (getloadavg(loadavgs, 3) == 3) {
daily_loadavg = loadavgs[2];
} else {
derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
daily_loadavg = 1.0;
}

// read superblock
r = read_superblock();
if (r < 0) {
Expand Down Expand Up @@ -3862,8 +3871,12 @@ void OSD::heartbeat()

// get CPU load avg
double loadavgs[1];
if (getloadavg(loadavgs, 1) == 1)
int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
if (getloadavg(loadavgs, 1) == 1) {
logger->set(l_osd_loadavg, 100 * loadavgs[0]);
daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
}

dout(30) << "heartbeat checking stats" << dendl;

Expand Down Expand Up @@ -6065,23 +6078,35 @@ bool OSD::scrub_time_permit(utime_t now)

// Reports whether the host's CPU load currently permits scrubbing.
//
// Reading the three-sample version of the body: returns true when
// loadavgs[0] is under osd_scrub_load_threshold, or when it is under both
// daily_loadavg and loadavgs[2]; returns false otherwise, and also when
// getloadavg() does not fill every requested sample. Each outcome is logged
// through dout() before returning.
//
// NOTE(review): this text looks like a diff hunk whose +/- markers are
// missing, so superseded and replacement lines appear side by side (two
// loadavgs declarations, two threshold checks) -- confirm which lines
// survive in the merged file before reusing it.
bool OSD::scrub_load_below_threshold()
{
// NOTE(review): presumably the pre-change single-sample read -- TODO confirm
double loadavgs[1];
if (getloadavg(loadavgs, 1) != 1) {
// Three samples are requested; the code below reads loadavgs[0] and
// loadavgs[2], the latter being logged as the "15m avg".
double loadavgs[3];
if (getloadavg(loadavgs, 3) != 3) {
// An unreadable load average is treated as "do not scrub".
dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
return false;
}

// NOTE(review): presumably the pre-change threshold check -- TODO confirm
if (loadavgs[0] >= cct->_conf->osd_scrub_load_threshold) {
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " >= max " << cct->_conf->osd_scrub_load_threshold
<< " = no, load too high" << dendl;
return false;
} else {
// allow scrub if below configured threshold
if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " < max " << cct->_conf->osd_scrub_load_threshold
<< " = yes" << dendl;
return true;
}

// allow scrub if below daily avg and currently decreasing
// ("decreasing" is judged by loadavgs[0] being lower than loadavgs[2])
if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " < daily_loadavg " << daily_loadavg
<< " and < 15m avg " << loadavgs[2]
<< " = yes" << dendl;
return true;
}

// Neither condition held: log all three comparison values and refuse.
dout(20) << __func__ << " loadavg " << loadavgs[0]
<< " >= max " << cct->_conf->osd_scrub_load_threshold
<< " and ( >= daily_loadavg " << daily_loadavg
<< " or >= 15m avg " << loadavgs[2]
<< ") = no" << dendl;
return false;
}

void OSD::sched_scrub()
Expand Down
1 change: 1 addition & 0 deletions src/osd/OSD.h
Expand Up @@ -1505,6 +1505,7 @@ class OSD : public Dispatcher,
Messenger *hb_front_server_messenger;
Messenger *hb_back_server_messenger;
utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state
double daily_loadavg;

void _add_heartbeat_peer(int p);
void _remove_heartbeat_peer(int p);
Expand Down
5 changes: 5 additions & 0 deletions src/osd/PG.cc
Expand Up @@ -3246,6 +3246,11 @@ bool PG::sched_scrub()
bool time_for_deep = (ceph_clock_now(cct) >=
info.history.last_deep_scrub_stamp + cct->_conf->osd_deep_scrub_interval);

bool deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;

time_for_deep = (time_for_deep || deep_coin_flip);

//NODEEP_SCRUB so ignore time initiated deep-scrub
if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB))
Expand Down