
Commit

osd: mon: account for stretch bucket configs/changes when detecting intervals

Factor out the logic we wrote in PeeringState::choose_acting into a new
pg_pool_t::stretch_set_can_peer(), and use it in
PastIntervals::check_new_interval(). Should have accounted for this
when we first set it -- whoops!

Set last_force_resend in the OSDMap when we change values, in order
to make old clients do the right thing. The OSDs and new clients
will detect changes directly by looking at the various crush bucket
values in is_new_interval().

Signed-off-by: Greg Farnum <gfarnum@redhat.com>
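
Concretely, the detection the OSDs and new clients perform in is_new_interval() reduces to a field-by-field comparison of the four stretch-peering pool values between the previous and current epoch. A minimal standalone sketch of that comparison (illustrative names only, not the Ceph implementation):

// Sketch (illustration only): an interval boundary is forced whenever any
// of the stretch-peering knobs changed between two consecutive OSDMap epochs.
#include <cstdint>

struct stretch_fields {
  uint32_t bucket_count;      // peering_crush_bucket_count
  uint32_t bucket_target;     // peering_crush_bucket_target
  uint32_t bucket_barrier;    // peering_crush_bucket_barrier
  int32_t  mandatory_member;  // peering_crush_mandatory_member
};

// True if any stretch knob changed, which makes this a new past interval.
bool stretch_fields_changed(const stretch_fields& o, const stretch_fields& n) {
  return o.bucket_count != n.bucket_count ||
         o.bucket_target != n.bucket_target ||
         o.bucket_barrier != n.bucket_barrier ||
         o.mandatory_member != n.mandatory_member;
}
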
gregsfortytwo committed Jul 20, 2020
1 parent 0f49616 commit 97fbd19
Showing 6 changed files with 89 additions and 20 deletions.
4 changes: 3 additions & 1 deletion src/mon/OSDMonitor.cc
@@ -14383,6 +14383,7 @@ void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
       newp.peering_crush_bucket_count = new_site_count;
       newp.peering_crush_mandatory_member = remaining_site;
       newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
+      newp.last_force_op_resend = pending_inc.epoch;
       pending_inc.new_pools[pgi.first] = newp;
     }
   }
@@ -14403,7 +14404,7 @@ void OSDMonitor::trigger_recovery_stretch_mode()
   for (auto pgi : osdmap.pools) {
     if (pgi.second.peering_crush_bucket_count) {
       pg_pool_t newp(pgi.second);
-      // bump up the min_size since we have extra replicas available...
+      newp.last_force_op_resend = pending_inc.epoch;
       pending_inc.new_pools[pgi.first] = newp;
     }
   }
@@ -14473,6 +14474,7 @@ void OSDMonitor::trigger_healthy_stretch_mode()
       newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
       newp.peering_crush_mandatory_member = 0;
       newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+      newp.last_force_op_resend = pending_inc.epoch;
       pending_inc.new_pools[pgi.first] = newp;
     }
   }
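
All three triggers above bump last_force_op_resend on the affected pools. That is the backward-compatibility half of the fix: a pre-stretch client cannot compare the crush bucket fields itself, but it does honor a pool's force-resend epoch. A sketch of the idea (not the actual Objecter logic):

// Sketch only: a client that last addressed an op at epoch `sent_epoch`
// must resend it once the pool's last_force_op_resend has moved past that
// epoch, even though the client cannot see why the mapping changed.
#include <cstdint>

bool must_force_resend(uint64_t pool_last_force_op_resend,
                       uint64_t sent_epoch) {
  return pool_last_force_op_resend > sent_epoch;
}
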
19 changes: 3 additions & 16 deletions src/osd/PeeringState.cc
@@ -2442,22 +2442,9 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
   // didn't break them with earlier choices!
   const pg_pool_t& pg_pool = pool.info;
   if (pg_pool.is_stretch_pool()) {
-    const uint32_t barrier_id = pg_pool.peering_crush_bucket_barrier;
-    const uint32_t barrier_count = pg_pool.peering_crush_bucket_count;
-    set<int> ancestors;
-    const shared_ptr<CrushWrapper>& crush = osdmap_ref->crush;
-    for (int osdid : want) {
-      int ancestor = crush->get_parent_of_type(osdid, barrier_id,
-                                               pg_pool.crush_rule);
-      ancestors.insert(ancestor);
-    }
-    if (ancestors.size() < barrier_count) {
-      psdout(5) << "peeering blocked: not enough crush buckets with OSDs in acting" << dendl;
-      return false;
-    } else if (pg_pool.peering_crush_mandatory_member &&
-               !ancestors.count(pg_pool.peering_crush_mandatory_member)) {
-      psdout(5) << "peering blocked: missing mandatory crush bucket member "
-                << pg_pool.peering_crush_mandatory_member << dendl;
+    stringstream ss;
+    if (!pg_pool.stretch_set_can_peer(want, *get_osdmap(), &ss)) {
+      psdout(5) << "peering blocked by stretch_can_peer: " << ss.str() << dendl;
       return false;
     }
   }
50 changes: 48 additions & 2 deletions src/osd/osd_types.cc
@@ -2237,6 +2237,35 @@ void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
   calc_grade_table();
 }
 
+bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+                                     std::ostream *out) const
+{
+  const uint32_t barrier_id = peering_crush_bucket_barrier;
+  const uint32_t barrier_count = peering_crush_bucket_count;
+  set<int> ancestors;
+  const shared_ptr<CrushWrapper>& crush = osdmap.crush;
+  for (int osdid : want) {
+    int ancestor = crush->get_parent_of_type(osdid, barrier_id,
+                                             crush_rule);
+    ancestors.insert(ancestor);
+  }
+  if (ancestors.size() < barrier_count) {
+    if (out) {
+      *out << __func__ << ": not enough crush buckets with OSDs in want set "
+           << want;
+    }
+    return false;
+  } else if (peering_crush_mandatory_member &&
+             !ancestors.count(peering_crush_mandatory_member)) {
+    if (out) {
+      *out << __func__ << ": missing mandatory crush bucket member "
+           << peering_crush_mandatory_member;
+    }
+    return false;
+  }
+  return true;
+}
+
 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
 {
   pg_pool_t a;
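
The predicate walks each OSD in the want set up to its ancestor bucket of the barrier type (for example, a datacenter) and requires that the set span at least peering_crush_bucket_count distinct buckets, including the mandatory member when one is configured. A toy model, with a plain map standing in for the CRUSH ancestor lookup (hypothetical helper, illustration only):

// Toy model of stretch_set_can_peer() (illustration only): osd_to_site
// stands in for CrushWrapper::get_parent_of_type(); each OSD is assumed to
// map to exactly one site id.
#include <cstdint>
#include <map>
#include <set>

bool toy_stretch_can_peer(const std::set<int>& want,
                          const std::map<int, int>& osd_to_site,
                          uint32_t bucket_count, int32_t mandatory_member) {
  std::set<int> sites;
  for (int osd : want)
    sites.insert(osd_to_site.at(osd));            // ancestor-bucket lookup
  if (sites.size() < bucket_count)
    return false;                                 // too few sites represented
  if (mandatory_member && !sites.count(mandatory_member))
    return false;                                 // required site missing
  return true;
}

With two datacenters and a bucket count of 2, a want set drawn entirely from one site fails the first test; after trigger_degraded_stretch_mode() lowers the count to 1 and pins the surviving site as the mandatory member, that same set can peer.
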
@@ -3951,6 +3980,14 @@ bool PastIntervals::is_new_interval(
   bool new_sort_bitwise,
   bool old_recovery_deletes,
   bool new_recovery_deletes,
+  uint32_t old_crush_count,
+  uint32_t new_crush_count,
+  uint32_t old_crush_target,
+  uint32_t new_crush_target,
+  uint32_t old_crush_barrier,
+  uint32_t new_crush_barrier,
+  int32_t old_crush_member,
+  int32_t new_crush_member,
   pg_t pgid) {
   return old_acting_primary != new_acting_primary ||
     new_acting != old_acting ||
@@ -3970,7 +4007,11 @@ bool PastIntervals::is_new_interval(
     // merge target
     pgid.is_merge_target(old_pg_num, new_pg_num) ||
     old_sort_bitwise != new_sort_bitwise ||
-    old_recovery_deletes != new_recovery_deletes;
+    old_recovery_deletes != new_recovery_deletes ||
+    old_crush_count != new_crush_count ||
+    old_crush_target != new_crush_target ||
+    old_crush_barrier != new_crush_barrier ||
+    old_crush_member != new_crush_member;
 }
 
 bool PastIntervals::is_new_interval(
@@ -4015,6 +4056,10 @@ bool PastIntervals::is_new_interval(
     osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
     lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
     osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
+    plast->peering_crush_bucket_count, pi->peering_crush_bucket_count,
+    plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
+    plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
+    plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
     pgid);
 }
 
@@ -4117,6 +4162,8 @@ bool PastIntervals::check_new_interval(
   if (num_acting &&
       i.primary != -1 &&
       num_acting >= old_pg_pool.min_size &&
+      (!old_pg_pool.is_stretch_pool() ||
+       old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) &&
       could_have_gone_active(old_acting_shards)) {
     if (out)
       *out << __func__ << " " << i
@@ -4168,7 +4215,6 @@
   }
 }
 
-
 // true if the given map affects the prior set
 bool PastIntervals::PriorSet::affected_by_map(
   const OSDMap &osdmap,
20 changes: 19 additions & 1 deletion src/osd/osd_types.h
@@ -1160,6 +1160,8 @@ struct pg_merge_meta_t {
 };
 WRITE_CLASS_ENCODER(pg_merge_meta_t)
 
+class OSDMap;
+
 /*
  * pg_pool
  */
@@ -1464,6 +1466,15 @@ struct pg_pool_t {
     return peering_crush_bucket_count != 0;
   }
 
+  bool stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+                            std::ostream *out) const;
+  bool stretch_set_can_peer(const vector<int>& want, const OSDMap& osdmap,
+                            std::ostream *out) const {
+    set<int> swant;
+    for (auto i : want) swant.insert(i);
+    return stretch_set_can_peer(swant, osdmap, out);
+  }
+
   uint64_t target_max_bytes = 0;   ///< tiering: target max pool size
   uint64_t target_max_objects = 0; ///< tiering: target max pool size
 
@@ -3066,7 +3077,6 @@ struct pg_fast_info_t {
 WRITE_CLASS_ENCODER(pg_fast_info_t)
 
 
-class OSDMap;
 /**
  * PastIntervals -- information needed to determine the PriorSet and
  * the might_have_unfound set
@@ -3202,6 +3212,14 @@ class PastIntervals {
     bool new_sort_bitwise,
     bool old_recovery_deletes,
     bool new_recovery_deletes,
+    uint32_t old_crush_count,
+    uint32_t new_crush_count,
+    uint32_t old_crush_target,
+    uint32_t new_crush_target,
+    uint32_t old_crush_barrier,
+    uint32_t new_crush_barrier,
+    int32_t old_crush_member,
+    int32_t new_crush_member,
     pg_t pgid
     );
 
12 changes: 12 additions & 0 deletions src/osdc/Objecter.cc
@@ -2797,6 +2797,14 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
       sort_bitwise,
       t->recovery_deletes,
       recovery_deletes,
+      t->peering_crush_bucket_count,
+      pi->peering_crush_bucket_count,
+      t->peering_crush_bucket_target,
+      pi->peering_crush_bucket_target,
+      t->peering_crush_bucket_barrier,
+      pi->peering_crush_bucket_barrier,
+      t->peering_crush_mandatory_member,
+      pi->peering_crush_mandatory_member,
       prev_pgid)) {
     force_resend = true;
   }
@@ -2848,6 +2856,10 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
     t->actual_pgid = spgid;
     t->sort_bitwise = sort_bitwise;
     t->recovery_deletes = recovery_deletes;
+    t->peering_crush_bucket_count = pi->peering_crush_bucket_count;
+    t->peering_crush_bucket_target = pi->peering_crush_bucket_target;
+    t->peering_crush_bucket_barrier = pi->peering_crush_bucket_barrier;
+    t->peering_crush_mandatory_member = pi->peering_crush_mandatory_member;
     ldout(cct, 10) << __func__ << " "
                    << " raw pgid " << pgid << " -> actual " << t->actual_pgid
                    << " acting " << acting
4 changes: 4 additions & 0 deletions src/osdc/Objecter.h
@@ -1778,6 +1778,10 @@ class Objecter : public md_config_obs_t, public Dispatcher {
     int min_size = -1; ///< the min size of the pool when were were last mapped
     bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise
     bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering
+    uint32_t peering_crush_bucket_count = 0;
+    uint32_t peering_crush_bucket_target = 0;
+    uint32_t peering_crush_bucket_barrier = 0;
+    int32_t peering_crush_mandatory_member = 0;
 
     bool used_replica = false;
     bool paused = false;
