Skip to content

Commit

Permalink
osd: automatically repair replicated replica on pulling error
Browse files Browse the repository at this point in the history
However, this is not a complete solution, since the broken object
info may still be lost if we switch primaries or simply power off nodes.
I think a better idea would be to also add these kinds of broken objects
back into the replica's own missing set at the same time, e.g., the way we
handle primary read errors.

But for now I am not sure whether that should be a concern.

Fixes: http://tracker.ceph.com/issues/39101
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
(cherry picked from commit 202606c)
  • Loading branch information
xiexingguo authored and Prashant D committed Apr 22, 2019
1 parent 7f220e2 commit f56326e
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 6 deletions.
4 changes: 3 additions & 1 deletion src/osd/PGBackend.h
Expand Up @@ -105,7 +105,9 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
pg_shard_t peer,
const hobject_t oid) = 0;

virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0;
virtual void failed_push(const list<pg_shard_t> &from,
const hobject_t &soid,
const eversion_t &need = eversion_t()) = 0;
virtual void finish_degraded_object(const hobject_t& oid) = 0;
virtual void primary_failed(const hobject_t &soid) = 0;
virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0;
Expand Down
13 changes: 11 additions & 2 deletions src/osd/PrimaryLogPG.cc
Expand Up @@ -11694,7 +11694,8 @@ void PrimaryLogPG::primary_failed(const hobject_t &soid)
failed_push(fl, soid);
}

void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
void PrimaryLogPG::failed_push(const list<pg_shard_t> &from,
const hobject_t &soid, const eversion_t &need)
{
dout(20) << __func__ << ": " << soid << dendl;
ceph_assert(recovering.count(soid));
Expand All @@ -11705,8 +11706,16 @@ void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &so
requeue_ops(blocked_ops);
}
recovering.erase(soid);
for (auto&& i : from)
for (auto&& i : from) {
missing_loc.remove_location(soid, i);
if (need != eversion_t()) {
dout(0) << __func__ << " adding " << soid << " to shard " << i
<< "'s missing set too" << dendl;
auto pm = peer_missing.find(i);
if (pm != peer_missing.end())
pm->second.add(soid, need, eversion_t(), false);
}
}
dout(0) << __func__ << " " << soid << " from shard " << from
<< ", reps on " << missing_loc.get_locations(soid)
<< " unfound? " << missing_loc.is_unfound(soid) << dendl;
Expand Down
4 changes: 3 additions & 1 deletion src/osd/PrimaryLogPG.h
Expand Up @@ -290,7 +290,9 @@ class PrimaryLogPG : public PG, public PGBackend::Listener {
const hobject_t &oid,
const object_stat_sum_t &stat_diff,
bool is_delete) override;
void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) override;
void failed_push(const list<pg_shard_t> &from,
const hobject_t &soid,
const eversion_t &need = eversion_t()) override;
void primary_failed(const hobject_t &soid) override;
bool primary_error(const hobject_t& soid, eversion_t v) override;
void cancel_pull(const hobject_t &soid) override;
Expand Down
6 changes: 4 additions & 2 deletions src/osd/ReplicatedBackend.cc
Expand Up @@ -2182,9 +2182,11 @@ void ReplicatedBackend::_failed_pull(pg_shard_t from, const hobject_t &soid)
{
dout(20) << __func__ << ": " << soid << " from " << from << dendl;
list<pg_shard_t> fl = { from };
get_parent()->failed_push(fl, soid);
auto it = pulling.find(soid);
assert(it != pulling.end());
get_parent()->failed_push(fl, soid, it->second.recovery_info.version);

clear_pull(pulling.find(soid));
clear_pull(it);
}

void ReplicatedBackend::clear_pull_from(
Expand Down

0 comments on commit f56326e

Please sign in to comment.