Merge pull request #46643 from ronen-fr/wip-rf-blocked
scrub/osd: add clearer reminders that a scrub is blocked

Reviewed-by: Laura Flores <lflores@redhat.com>
Reviewed-by: Matan Breizman <mbreizma@redhat.com>
ronen-fr committed Jun 23, 2022
2 parents 6ae30f6 + 996d906 commit 2f71185
Showing 11 changed files with 190 additions and 25 deletions.
1 change: 1 addition & 0 deletions qa/suites/crimson-rados/basic/tasks/rados_api_tests.yaml
@@ -19,6 +19,7 @@ overrides:
osd:
osd class load list: "*"
osd class default list: "*"
osd blocked scrub grace period: 3600
tasks:
- workunit:
clients:
1 change: 1 addition & 0 deletions qa/tasks/thrashosds-health.yaml
@@ -3,6 +3,7 @@ overrides:
conf:
osd:
osd max markdown count: 1000
osd blocked scrub grace period: 3600
log-ignorelist:
- overall HEALTH_
- \(OSDMAP_FLAGS\)
8 changes: 8 additions & 0 deletions src/common/options/osd.yaml.in
@@ -449,6 +449,14 @@ options:
see_also:
- osd_deep_scrub_large_omap_object_key_threshold
with_legacy: true
# when scrubbing blocks on a locked object
- name: osd_blocked_scrub_grace_period
type: int
level: advanced
desc: Time (seconds) before issuing a cluster-log warning
long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked.
default: 120
with_legacy: true
# where rados plugins are stored
- name: osd_class_dir
type: str
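
The option above feeds the alarm machinery added later in PgScrubber::acquire_blocked_alarm(): a value of 0 disables the warning entirely, while any other value is the number of seconds a scrubbed chunk may stay blocked on a locked object before the cluster-log warning is issued. A minimal sketch of that interpretation (illustrative only, not part of the commit; the helper name blocked_warn_timeout is hypothetical):

#include <chrono>
#include <optional>

// nullopt = never warn; otherwise the grace period before the warning fires
std::optional<std::chrono::seconds> blocked_warn_timeout(int grace_seconds)
{
  if (grace_seconds == 0) {
    return std::nullopt;  // osd_blocked_scrub_grace_period = 0 disables the alarm
  }
  return std::chrono::seconds{grace_seconds};
}

The two qa overrides earlier in this change raise the grace period to 3600 seconds, presumably so that tests which hold objects locked for long stretches do not trip the new warning.
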
12 changes: 12 additions & 0 deletions src/osd/OSD.cc
@@ -7569,6 +7569,18 @@ void OSD::sched_scrub()
{
auto& scrub_scheduler = service.get_scrub_services();

if (auto blocked_pgs = scrub_scheduler.get_blocked_pgs_count();
blocked_pgs > 0) {
// some PGs managed by this OSD were blocked by a locked object during
// scrub. This means we might not have the resources needed to scrub now.
dout(10)
<< fmt::format(
"{}: PGs are blocked while scrubbing due to locked objects ({} PGs)",
__func__,
blocked_pgs)
<< dendl;
}

// fail fast if no resources are available
if (!scrub_scheduler.can_inc_scrubs()) {
dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
15 changes: 11 additions & 4 deletions src/osd/osd_types.cc
@@ -2931,10 +2931,17 @@ void pg_stat_t::dump_brief(Formatter *f) const
std::string pg_stat_t::dump_scrub_schedule() const
{
if (scrub_sched_status.m_is_active) {
return fmt::format(
"{}scrubbing for {}s",
((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
scrub_sched_status.m_duration_seconds);
// are we blocked (in fact, stuck) on some locked object?
if (scrub_sched_status.m_sched_status == pg_scrub_sched_status_t::blocked) {
return fmt::format(
"Blocked! locked objects (for {}s)",
scrub_sched_status.m_duration_seconds);
} else {
return fmt::format(
"{}scrubbing for {}s",
((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
scrub_sched_status.m_duration_seconds);
}
}
switch (scrub_sched_status.m_sched_status) {
case pg_scrub_sched_status_t::unknown:
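
For illustration, a self-contained snippet (not part of the commit) showing the two schedule strings the branch above can now produce, the new "Blocked!" form and the pre-existing "scrubbing" form, assuming only the fmt library:

#include <fmt/format.h>
#include <iostream>

int main()
{
  // new branch: the scrub is stuck waiting for a locked object, 130s so far
  std::cout << fmt::format("Blocked! locked objects (for {}s)", 130) << "\n";
  // existing branch: a deep scrub that has been running for 90s
  std::cout << fmt::format("{}scrubbing for {}s", "deep ", 90) << "\n";
  return 0;
}
// prints:
//   Blocked! locked objects (for 130s)
//   deep scrubbing for 90s
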
3 changes: 2 additions & 1 deletion src/osd/osd_types.h
@@ -2170,7 +2170,8 @@ enum class pg_scrub_sched_status_t : uint16_t {
not_queued, ///< not in the OSD's scrub queue. Probably not active.
active, ///< scrubbing
scheduled, ///< scheduled for a scrub at an already determined time
queued ///< queued to be scrubbed
queued, ///< queued to be scrubbed
blocked ///< blocked waiting for objects to be unlocked
};

struct pg_scrubbing_status_t {
19 changes: 19 additions & 0 deletions src/osd/scrubber/osd_scrub_sched.cc
@@ -800,3 +800,22 @@ void ScrubQueue::dump_scrub_reservations(ceph::Formatter* f) const
f->dump_int("scrubs_remote", scrubs_remote);
f->dump_int("osd_max_scrubs", conf()->osd_max_scrubs);
}

void ScrubQueue::clear_pg_scrub_blocked(spg_t blocked_pg)
{
dout(5) << fmt::format(": pg {} is unblocked", blocked_pg) << dendl;
--blocked_scrubs_cnt;
ceph_assert(blocked_scrubs_cnt >= 0);
}

void ScrubQueue::mark_pg_scrub_blocked(spg_t blocked_pg)
{
dout(5) << fmt::format(": pg {} is blocked on an object", blocked_pg)
<< dendl;
++blocked_scrubs_cnt;
}

int ScrubQueue::get_blocked_pgs_count() const
{
return blocked_scrubs_cnt;
}
24 changes: 24 additions & 0 deletions src/osd/scrubber/osd_scrub_sched.h
@@ -235,6 +235,13 @@ class ScrubQueue {
*/
std::atomic_bool updated{false};

/**
* the scrubber is waiting for locked objects to be unlocked.
* Set after a grace period has passed.
*/
bool blocked{false};
utime_t blocked_since{};

utime_t penalty_timeout{0, 0};

CephContext* cct;
@@ -376,6 +383,11 @@ class ScrubQueue {
void dec_scrubs_remote();
void dump_scrub_reservations(ceph::Formatter* f) const;

/// counting the number of PGs stuck while scrubbing, waiting for objects
void mark_pg_scrub_blocked(spg_t blocked_pg);
void clear_pg_scrub_blocked(spg_t blocked_pg);
int get_blocked_pgs_count() const;

/**
* Pacing the scrub operation by inserting delays (mostly between chunks)
*
@@ -460,6 +472,18 @@ class ScrubQueue {
int scrubs_local{0};
int scrubs_remote{0};

/**
* The scrubbing of PGs might be delayed if the scrubbed chunk of objects is
* locked by some other operation. A bug might cause this to be an infinite
* delay. If that happens, the OSDs "scrub resources" (i.e. the
* counters that limit the number of concurrent scrub operations) might
* be exhausted.
* We do issue a cluster-log warning in such occasions, but that message is
* easy to miss. The 'some pg is blocked' global flag is used to note the
* existence of such a situation in the scrub-queue log messages.
*/
std::atomic_int_fast16_t blocked_scrubs_cnt{0};

std::atomic_bool a_pg_is_reserving{false};

[[nodiscard]] bool scrub_load_below_threshold() const;
103 changes: 84 additions & 19 deletions src/osd/scrubber/pg_scrubber.cc
@@ -538,6 +538,7 @@ void PgScrubber::update_scrub_job(const requested_scrub_t& request_flags)
m_pg->info,
m_pg->get_pgpool().info.opts);
m_osds->get_scrub_services().update_job(m_scrub_job, suggested);
m_pg->publish_stats_to_osd();
}

dout(15) << __func__ << " done " << registration_state() << dendl;
@@ -566,7 +567,6 @@ void PgScrubber::scrub_requested(scrub_level_t scrub_level,
<< dendl;

update_scrub_job(req_flags);
m_pg->publish_stats_to_osd();
}


@@ -775,8 +775,20 @@ bool PgScrubber::range_intersects_scrub(const hobject_t& start,

Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm()
{
int grace = get_pg_cct()->_conf->osd_blocked_scrub_grace_period;
if (grace == 0) {
// we will not be sending any alarms re the blocked object
dout(20)
<< __func__
<< ": blocked-alarm disabled ('osd_blocked_scrub_grace_period' set to 0)"
<< dendl;
return nullptr;
}
ceph::timespan grace_period{m_debug_blockrange ? 4s : seconds{grace}};
dout(30) << __func__ << ": timeout:" << grace_period.count() << dendl;
return std::make_unique<blocked_range_t>(m_osds,
ceph::timespan{300s},
grace_period,
*this,
m_pg_id);
}

@@ -1730,6 +1742,27 @@ bool PgScrubber::is_queued_or_active() const
return m_queued_or_active;
}

void PgScrubber::set_scrub_blocked(utime_t since)
{
ceph_assert(!m_scrub_job->blocked);
// we are called from a time-triggered lambda,
// thus - not under PG-lock
PGRef pg = m_osds->osd->lookup_lock_pg(m_pg_id);
m_osds->get_scrub_services().mark_pg_scrub_blocked(m_pg_id);
m_scrub_job->blocked_since = since;
m_scrub_job->blocked = true;
m_pg->publish_stats_to_osd();
pg->unlock();
}

void PgScrubber::clear_scrub_blocked()
{
ceph_assert(m_scrub_job->blocked);
m_osds->get_scrub_services().clear_pg_scrub_blocked(m_pg_id);
m_scrub_job->blocked = false;
m_pg->publish_stats_to_osd();
}

/*
* note: only called for the Primary.
*/
@@ -2029,30 +2062,52 @@ void PgScrubber::dump_active_scrubber(ceph::Formatter* f, bool is_deep) const
}
f->close_section();
}
f->dump_string("schedule", "scrubbing");
if (m_scrub_job->blocked) {
f->dump_string("schedule", "blocked");
} else {
f->dump_string("schedule", "scrubbing");
}
}

pg_scrubbing_status_t PgScrubber::get_schedule() const
{
dout(25) << __func__ << dendl;

if (!m_scrub_job) {
return pg_scrubbing_status_t{};
}

dout(25) << fmt::format("{}: active:{} blocked:{}",
__func__,
m_active,
m_scrub_job->blocked)
<< dendl;

auto now_is = ceph_clock_now();

if (m_active) {
// report current scrub info, including updated duration
int32_t duration = (utime_t{now_is} - scrub_begin_stamp).sec();

return pg_scrubbing_status_t{
utime_t{},
duration,
pg_scrub_sched_status_t::active,
true, // active
(m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
false /* is periodic? unknown, actually */};
if (m_scrub_job->blocked) {
// a bug. An object is held locked.
int32_t blocked_for =
(utime_t{now_is} - m_scrub_job->blocked_since).sec();
return pg_scrubbing_status_t{
utime_t{},
blocked_for,
pg_scrub_sched_status_t::blocked,
true, // active
(m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
false};

} else {
int32_t duration = (utime_t{now_is} - scrub_begin_stamp).sec();
return pg_scrubbing_status_t{
utime_t{},
duration,
pg_scrub_sched_status_t::active,
true, // active
(m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
false /* is periodic? unknown, actually */};
}
}
if (m_scrub_job->state != ScrubQueue::qu_state_t::registered) {
return pg_scrubbing_status_t{utime_t{},
@@ -2669,20 +2724,27 @@ ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)

blocked_range_t::blocked_range_t(OSDService* osds,
ceph::timespan waittime,
ScrubMachineListener& scrubber,
spg_t pg_id)
: m_osds{osds}
, m_scrubber{scrubber}
, m_pgid{pg_id}
{
auto now_is = std::chrono::system_clock::now();
m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) {
m_callbk = new LambdaContext([this, now_is]([[maybe_unused]] int r) {
std::time_t now_c = std::chrono::system_clock::to_time_t(now_is);
char buf[50];
strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c));
lgeneric_subdout(g_ceph_context, osd, 10)
<< "PgScrubber: " << pg_id << " blocked on an object for too long (since "
<< buf << ")" << dendl;
osds->clog->warn() << "osd." << osds->whoami << " PgScrubber: " << pg_id
<< " blocked on an object for too long (since " << buf
<< ")";
<< "PgScrubber: " << m_pgid
<< " blocked on an object for too long (since " << buf << ")" << dendl;
m_osds->clog->warn() << "osd." << m_osds->whoami
<< " PgScrubber: " << m_pgid
<< " blocked on an object for too long (since " << buf
<< ")";

m_warning_issued = true;
m_scrubber.set_scrub_blocked(utime_t{now_c,0});
return;
});

@@ -2692,6 +2754,9 @@ blocked_range_t::blocked_range_t(OSDService* osds,

blocked_range_t::~blocked_range_t()
{
if (m_warning_issued) {
m_scrubber.clear_scrub_blocked();
}
std::lock_guard l(m_osds->sleep_lock);
m_osds->sleep_timer.cancel_event(m_callbk);
}
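
The blocked-range handling above is a timer-armed RAII guard: acquire_blocked_alarm() returns a unique_ptr<blocked_range_t>, and if the grace period elapses before that guard is destroyed, the callback logs a warning, marks the OSD-wide scrub queue and the scrub job as blocked, and the guard's destructor later clears that state once the range is released. Below is a simplified, standalone model of the pattern (not Ceph code: BlockedGuard and ScrubFlags are illustrative names, and a plain thread with a condition variable stands in for the OSD's sleep_timer):

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

struct ScrubFlags {
  std::atomic<int> blocked_pgs{0};  // stands in for ScrubQueue::blocked_scrubs_cnt
};

class BlockedGuard {
 public:
  BlockedGuard(ScrubFlags& flags, std::chrono::milliseconds grace)
      : m_flags{flags},
        m_timer{[this, grace] {
          std::unique_lock l{m_mtx};
          // wait until cancelled; on timeout, raise the "blocked" flag
          if (!m_cv.wait_for(l, grace, [this] { return m_cancelled; })) {
            ++m_flags.blocked_pgs;  // analogous to mark_pg_scrub_blocked()
            m_warning_issued = true;
            std::cout << "scrub blocked on a locked object for too long\n";
          }
        }} {}

  ~BlockedGuard() {
    {
      std::lock_guard l{m_mtx};
      m_cancelled = true;  // analogous to cancelling the sleep_timer event
    }
    m_cv.notify_all();
    m_timer.join();
    if (m_warning_issued) {
      --m_flags.blocked_pgs;  // analogous to clear_pg_scrub_blocked()
    }
  }

 private:
  ScrubFlags& m_flags;
  std::mutex m_mtx;
  std::condition_variable m_cv;
  bool m_cancelled{false};
  bool m_warning_issued{false};
  std::thread m_timer;
};

int main() {
  using namespace std::chrono_literals;
  ScrubFlags flags;
  {
    BlockedGuard guard{flags, 50ms};      // short grace period for the demo
    std::this_thread::sleep_for(120ms);   // the "object" stays locked past the grace
    std::cout << "blocked PGs while stuck: " << flags.blocked_pgs << "\n";  // typically 1
  }                                       // guard destroyed -> flag cleared
  std::cout << "blocked PGs after release: " << flags.blocked_pgs << "\n";  // 0
  return 0;
}

As in the real code, the counter is only decremented if the alarm actually fired, which is what the m_warning_issued flag tracks here and in blocked_range_t.
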
3 changes: 3 additions & 0 deletions src/osd/scrubber/pg_scrubber.h
@@ -459,6 +459,9 @@ class PgScrubber : public ScrubPgIF,
void select_range_n_notify() final;

Scrub::BlockedRangeWarning acquire_blocked_alarm() final;
void set_scrub_blocked(utime_t since) final;
void clear_scrub_blocked() final;


/// walk the log to find the latest update that affects our chunk
eversion_t search_log_for_updates() const final;
26 changes: 25 additions & 1 deletion src/osd/scrubber/scrub_machine_lstnr.h
@@ -9,6 +9,8 @@
#include "include/Context.h"
#include "osd/osd_types.h"

struct ScrubMachineListener;

namespace Scrub {

enum class PreemptionNoted { no_preemption, preempted };
@@ -46,11 +48,23 @@ struct preemption_t {
/// an aux used when blocking on a busy object.
/// Issues a log warning if still blocked after 'waittime'.
struct blocked_range_t {
blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id);
blocked_range_t(OSDService* osds,
ceph::timespan waittime,
ScrubMachineListener& scrubber,
spg_t pg_id);
~blocked_range_t();

OSDService* m_osds;
ScrubMachineListener& m_scrubber;

/// used to identify ourselves to the PG, when no longer blocked
spg_t m_pgid;
Context* m_callbk;

// once timed-out, we flag the OSD's scrub-queue as having
// a problem. 'm_warning_issued' signals the need to clear
// that OSD-wide flag.
bool m_warning_issued{false};
};

using BlockedRangeWarning = std::unique_ptr<blocked_range_t>;
@@ -173,6 +187,16 @@ struct ScrubMachineListener {
virtual void set_queued_or_active() = 0;
virtual void clear_queued_or_active() = 0;

/**
* Our scrubbing is blocked, waiting for an excessive length of time for
* our target chunk to be unlocked. We will set the corresponding flags,
* both in the OSD_wide scrub-queue object, and in our own scrub-job object.
* Both flags are used to report the unhealthy state in the log and in
* response to scrub-queue queries.
*/
virtual void set_scrub_blocked(utime_t since) = 0;
virtual void clear_scrub_blocked() = 0;

/**
* the FSM interface into the "are we waiting for maps, either our own or from
* replicas" state.