Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osd/PG: fix recovery op leak #18524

Merged
merged 3 commits into from
Oct 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
105 changes: 32 additions & 73 deletions src/osd/PG.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2209,7 +2209,6 @@ void PG::start_recovery_op(const hobject_t& soid)
assert(recovery_ops_active >= 0);
recovery_ops_active++;
#ifdef DEBUG_RECOVERY_OIDS
assert(recovering_oids.count(soid) == 0);
recovering_oids.insert(soid);
#endif
osd->start_recovery_op(this, soid);
Expand All @@ -2226,7 +2225,7 @@ void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
recovery_ops_active--;
#ifdef DEBUG_RECOVERY_OIDS
assert(recovering_oids.count(soid));
recovering_oids.erase(soid);
recovering_oids.erase(recovering_oids.find(soid));
#endif
osd->finish_recovery_op(this, soid, dequeue);

Expand Down Expand Up @@ -6413,16 +6412,11 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
pg->publish_stats_to_osd();
}

boost::statechart::result
PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
void PG::RecoveryState::Backfilling::cancel_backfill()
{
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);

pg->state_set(PG_STATE_BACKFILL_WAIT);
pg->state_clear(PG_STATE_BACKFILLING);

for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
++it) {
Expand All @@ -6439,8 +6433,20 @@ PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
}
}

pg->waiting_on_backfill.clear();
if (!pg->waiting_on_backfill.empty()) {
pg->waiting_on_backfill.clear();
pg->finish_recovery_op(hobject_t::get_max());
}
}

boost::statechart::result
PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
{
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
pg->state_set(PG_STATE_BACKFILL_WAIT);
pg->state_clear(PG_STATE_BACKFILLING);
cancel_backfill();
pg->schedule_backfill_retry(c.delay);
return transit<NotBackfilling>();
}
Expand All @@ -6450,58 +6456,19 @@ PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
{
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);

pg->state_set(PG_STATE_BACKFILL_UNFOUND);
pg->state_clear(PG_STATE_BACKFILLING);

for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
++it) {
assert(*it != pg->pg_whoami);
ConnectionRef con = pg->osd->get_con_osd_cluster(
it->osd, pg->get_osdmap()->get_epoch());
if (con) {
pg->osd->send_message_osd_cluster(
new MBackfillReserve(
MBackfillReserve::RELEASE,
spg_t(pg->info.pgid.pgid, it->shard),
pg->get_osdmap()->get_epoch()),
con.get());
}
}

pg->waiting_on_backfill.clear();

cancel_backfill();
return transit<NotBackfilling>();
}

boost::statechart::result
PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &)
{
PG *pg = context< RecoveryMachine >().pg;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
pg->state_set(PG_STATE_BACKFILL_TOOFULL);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd feel better if we cleared PG_STATE_BACKFILLING here like the other 2 cases. There is a case where we are able to get reservations, but PrimaryLogPG::do_scan() noticed that we've gone over the backfill_full_ratio while backfilling. We might as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you don't want to include that here, I'll put it in later.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated! also added more detail to the commit msg (when bug was intorduced and then duplicated)

for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
++it) {
assert(*it != pg->pg_whoami);
ConnectionRef con = pg->osd->get_con_osd_cluster(
it->osd, pg->get_osdmap()->get_epoch());
if (con) {
pg->osd->send_message_osd_cluster(
new MBackfillReserve(
MBackfillReserve::RELEASE,
spg_t(pg->info.pgid.pgid, it->shard),
pg->get_osdmap()->get_epoch()),
con.get());
}
}

pg->waiting_on_backfill.clear();
pg->finish_recovery_op(hobject_t::get_max());

pg->state_clear(PG_STATE_BACKFILLING);
cancel_backfill();
pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
return transit<NotBackfilling>();
}
Expand All @@ -6510,27 +6477,8 @@ boost::statechart::result
PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &)
{
PG *pg = context< RecoveryMachine >().pg;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
pg->state_set(PG_STATE_BACKFILL_WAIT);

for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
it != pg->backfill_targets.end();
++it) {
assert(*it != pg->pg_whoami);
ConnectionRef con = pg->osd->get_con_osd_cluster(
it->osd, pg->get_osdmap()->get_epoch());
if (con) {
pg->osd->send_message_osd_cluster(
new MBackfillReserve(
MBackfillReserve::RELEASE,
spg_t(pg->info.pgid.pgid, it->shard),
pg->get_osdmap()->get_epoch()),
con.get());
}
}

pg->waiting_on_backfill.clear();

cancel_backfill();
return transit<WaitLocalBackfillReserved>();
}

Expand Down Expand Up @@ -6594,8 +6542,7 @@ void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
}

boost::statechart::result
PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
void PG::RecoveryState::WaitRemoteBackfillReserved::retry()
{
PG *pg = context< RecoveryMachine >().pg;
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
Expand Down Expand Up @@ -6625,7 +6572,19 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationReje
pg->publish_stats_to_osd();

pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
}

boost::statechart::result
PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
{
retry();
return transit<NotBackfilling>();
}

boost::statechart::result
PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
{
retry();
return transit<NotBackfilling>();
}

Expand Down
6 changes: 5 additions & 1 deletion src/osd/PG.h
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ class PG : public DoutPrefixProvider {
int recovery_ops_active;
set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
set<hobject_t> recovering_oids;
multiset<hobject_t> recovering_oids;
#endif

protected:
Expand Down Expand Up @@ -2193,20 +2193,24 @@ class PG : public DoutPrefixProvider {
boost::statechart::result react(const RemoteReservationRevoked& evt);
boost::statechart::result react(const DeferBackfill& evt);
boost::statechart::result react(const UnfoundBackfill& evt);
void cancel_backfill();
void exit();
};

struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
typedef boost::mpl::list<
boost::statechart::custom_reaction< RemoteBackfillReserved >,
boost::statechart::custom_reaction< RemoteReservationRejected >,
boost::statechart::custom_reaction< RemoteReservationRevoked >,
boost::statechart::transition< AllBackfillsReserved, Backfilling >
> reactions;
set<pg_shard_t>::const_iterator backfill_osd_it;
explicit WaitRemoteBackfillReserved(my_context ctx);
void retry();
void exit();
boost::statechart::result react(const RemoteBackfillReserved& evt);
boost::statechart::result react(const RemoteReservationRejected& evt);
boost::statechart::result react(const RemoteReservationRevoked& evt);
};

struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
Expand Down