Skip to content

Commit

Permalink
osd/PG: make scan recovery op cancellation match up reliably
Browse files Browse the repository at this point in the history
Previously, there was only one time we would end up in this region of code:
when the backfill was rejected by the peer.  Previously that was apparently
reliably when we had an outstanding SCAN request, because we would
unconditionally cancle the MAX recovery op and clear waiting_on_backfill.

See 624aaf2 for when this code appeared.

Now we have several similar paths, and we don't always have an outstanding
scan call (I don't think!).  Regardless, move most these three cases into
a common helper and make the finish_recovery_op completion conditional
on whether there is an outstanding SCAN.  This fixes a leak of a recovery
op when we defer while a scan is outstanding (this bug was recently
introduced).

Note that there is still one other time we register MAX ops: when we are
finishing backfill.  There, we start one per target.  But we will always
get back our reply and process it in the normal way (that old commit
did not change the timing for these).

Signed-off-by: Sage Weil <sage@redhat.com>
  • Loading branch information
liewegas committed Oct 25, 2017
1 parent bbba36b commit 5014783
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 49 deletions.
65 changes: 16 additions & 49 deletions src/osd/PG.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6362,16 +6362,11 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
pg->publish_stats_to_osd();
}

boost::statechart::result
PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
void PG::RecoveryState::Backfilling::cancel_backfill()
{
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);

pg->state_set(PG_STATE_BACKFILL_WAIT);
pg->state_clear(PG_STATE_BACKFILLING);

for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
++it) {
Expand All @@ -6388,8 +6383,20 @@ PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
}
}

pg->waiting_on_backfill.clear();
if (!pg->waiting_on_backfill.empty()) {
pg->waiting_on_backfill.clear();
pg->finish_recovery_op(hobject_t::get_max());
}
}

boost::statechart::result
PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
{
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
pg->state_set(PG_STATE_BACKFILL_WAIT);
pg->state_clear(PG_STATE_BACKFILLING);
cancel_backfill();
pg->schedule_backfill_retry(c.delay);
return transit<NotBackfilling>();
}
Expand All @@ -6399,58 +6406,18 @@ PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
{
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);

pg->state_set(PG_STATE_BACKFILL_UNFOUND);
pg->state_clear(PG_STATE_BACKFILLING);

for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
++it) {
assert(*it != pg->pg_whoami);
ConnectionRef con = pg->osd->get_con_osd_cluster(
it->osd, pg->get_osdmap()->get_epoch());
if (con) {
pg->osd->send_message_osd_cluster(
new MBackfillReserve(
MBackfillReserve::CANCEL,
spg_t(pg->info.pgid.pgid, it->shard),
pg->get_osdmap()->get_epoch()),
con.get());
}
}

pg->waiting_on_backfill.clear();

cancel_backfill();
return transit<NotBackfilling>();
}

boost::statechart::result
PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
{
PG *pg = context< RecoveryMachine >().pg;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
pg->state_set(PG_STATE_BACKFILL_TOOFULL);

for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
++it) {
assert(*it != pg->pg_whoami);
ConnectionRef con = pg->osd->get_con_osd_cluster(
it->osd, pg->get_osdmap()->get_epoch());
if (con) {
pg->osd->send_message_osd_cluster(
new MBackfillReserve(
MBackfillReserve::CANCEL,
spg_t(pg->info.pgid.pgid, it->shard),
pg->get_osdmap()->get_epoch()),
con.get());
}
}

pg->waiting_on_backfill.clear();
pg->finish_recovery_op(hobject_t::get_max());

cancel_backfill();
pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
return transit<NotBackfilling>();
}
Expand Down
1 change: 1 addition & 0 deletions src/osd/PG.h
Original file line number Diff line number Diff line change
Expand Up @@ -2174,6 +2174,7 @@ class PG : public DoutPrefixProvider {
boost::statechart::result react(const RemoteReservationRejected& evt);
boost::statechart::result react(const DeferBackfill& evt);
boost::statechart::result react(const UnfoundBackfill& evt);
void cancel_backfill();
void exit();
};

Expand Down

0 comments on commit 5014783

Please sign in to comment.