Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osd: Fix a bunch of stretch peering issues #40049

Merged
merged 10 commits into from
Mar 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
53 changes: 28 additions & 25 deletions src/osd/PeeringState.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1988,7 +1988,7 @@ void PeeringState::calc_replicated_acting_stretch(
osd,
pool.info.peering_crush_bucket_barrier,
pool.info.crush_rule);
return ancestors[ancestor];
return &ancestors[ancestor];
};

unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
Expand All @@ -2005,7 +2005,7 @@ void PeeringState::calc_replicated_acting_stretch(
want->push_back(osd);
acting_backfill->insert(
pg_shard_t(osd, shard_id_t::NO_SHARD));
get_ancestor(osd).inc_selected();
get_ancestor(osd)->inc_selected();
}
};
add_required(primary->first.osd);
Expand All @@ -2024,7 +2024,7 @@ void PeeringState::calc_replicated_acting_stretch(
}
}

if (want->size() >= pool.info.size) {
if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid
ss << " up set sufficient" << std::endl;
return;
}
Expand Down Expand Up @@ -2053,7 +2053,9 @@ void PeeringState::calc_replicated_acting_stretch(
}
if (!restrict_to_up_acting) {
for (auto &[cand, info] : all_info) {
if (!used(cand.osd) && usable_info(info)) {
if (!used(cand.osd) && usable_info(info) &&
(std::find(acting.begin(), acting.end(), cand.osd)
== acting.end())) {
ss << " other candidate " << cand << " " << info << std::endl;
candidates.push_back(
std::make_pair(get_osd_ord(false, info), cand.osd));
Expand All @@ -2064,7 +2066,7 @@ void PeeringState::calc_replicated_acting_stretch(

// We then filter these candidates by ancestor
std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
get_ancestor(cand.second).add_osd(cand.first, cand.second);
get_ancestor(cand.second)->add_osd(cand.first, cand.second);
});
}

Expand Down Expand Up @@ -2093,7 +2095,7 @@ void PeeringState::calc_replicated_acting_stretch(
if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
if (aiter != ancestors.end() &&
aiter->second.get_num_selected()) {
!aiter->second.get_num_selected()) {
ss << " adding required ancestor " << aiter->first << std::endl;
ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
pop_ancestor(aiter->second);
Expand All @@ -2107,7 +2109,13 @@ void PeeringState::calc_replicated_acting_stretch(
aheap.push_if_nonempty(anc.second);
});

/* and pull from this heap until it's empty or we have enough. */
/* and pull from this heap until it's empty or we have enough.
* "We have enough" is a sufficient check here for
* stretch_set_can_peer() because our heap sorting always
* pulls from ancestors with the least number of included OSDs,
* so if it is possible to satisfy the bucket_count constraints we
* will do so.
*/
while (!aheap.is_empty() && want->size() < pool.info.size) {
auto next = aheap.pop();
pop_ancestor(next.get());
Expand Down Expand Up @@ -2267,13 +2275,18 @@ void PeeringState::choose_async_recovery_replicated(
vector<int> candidate_want(*want);
for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
if (*it == cur_shard.osd) {
candidate_want.erase(it);
want->swap(candidate_want);
async_recovery->insert(cur_shard);
break;
candidate_want.erase(it);
if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
// if we're in stretch mode, we can only remove the osd if it doesn't
// break peering limits.
want->swap(candidate_want);
async_recovery->insert(cur_shard);
}
break;
}
}
}

psdout(20) << __func__ << " result want=" << *want
<< " async_recovery=" << *async_recovery << dendl;
}
Expand Down Expand Up @@ -2431,16 +2444,6 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
}
return false;
}
// make sure we respect the stretch cluster rules -- and
// didn't break them with earlier choices!
const pg_pool_t& pg_pool = pool.info;
if (pg_pool.is_stretch_pool()) {
stringstream ss;
if (!pg_pool.stretch_set_can_peer(want, *get_osdmap(), &ss)) {
psdout(5) << "peering blocked by stretch_can_peer: " << ss.str() << dendl;
return false;
}
}

if (request_pg_temp_change_only)
return true;
Expand Down Expand Up @@ -2651,7 +2654,7 @@ void PeeringState::activate(

if (is_primary()) {
// only update primary last_epoch_started if we will go active
if (acting.size() >= pool.info.min_size) {
if (acting_set_writeable()) {
ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
info.last_epoch_started <= activation_epoch);
info.last_epoch_started = activation_epoch;
Expand Down Expand Up @@ -2952,7 +2955,7 @@ void PeeringState::activate(
state_set(PG_STATE_ACTIVATING);
pl->on_activate(std::move(to_trim));
}
if (acting.size() >= pool.info.min_size) {
if (acting_set_writeable()) {
PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
pg_log.roll_forward(rollbacker.get());
}
Expand Down Expand Up @@ -6217,7 +6220,7 @@ boost::statechart::result PeeringState::Active::react(const AllReplicasActivated
pl->set_not_ready_to_merge_source(pgid);
}
}
} else if (ps->acting.size() < ps->pool.info.min_size) {
} else if (!ps->acting_set_writeable()) {
ps->state_set(PG_STATE_PEERED);
} else {
ps->state_set(PG_STATE_ACTIVE);
Expand Down Expand Up @@ -6375,7 +6378,7 @@ boost::statechart::result PeeringState::ReplicaActive::react(
{}, /* lease */
ps->get_lease_ack());

if (ps->acting.size() >= ps->pool.info.min_size) {
if (ps->acting_set_writeable()) {
ps->state_set(PG_STATE_ACTIVE);
} else {
ps->state_set(PG_STATE_PEERED);
Expand Down
10 changes: 10 additions & 0 deletions src/osd/PeeringState.h
Original file line number Diff line number Diff line change
Expand Up @@ -2312,6 +2312,16 @@ class PeeringState : public MissingLoc::MappingInfo {
hoid, get_min_last_complete_ondisk());
}

/**
 * Check whether the current acting set is allowed to go active and
 * serve writes: it must meet the pool's min_size, and for stretch
 * pools it must also satisfy the stretch peering constraints
 * (stretch_set_can_peer() is a no-op for non-stretch pools).
 */
bool acting_set_writeable() {
  if (acting.size() < pool.info.min_size) {
    return false;
  }
  return pool.info.stretch_set_can_peer(acting, *get_osdmap(), NULL);
}

/**
* Returns whether all peers which might have unfound objects have been
* queried or marked lost.
Expand Down
1 change: 1 addition & 0 deletions src/osd/osd_types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2239,6 +2239,7 @@ void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
std::ostream * out) const
{
if (!is_stretch_pool()) return true;
const uint32_t barrier_id = peering_crush_bucket_barrier;
const uint32_t barrier_count = peering_crush_bucket_count;
set<int> ancestors;
Expand Down
1 change: 1 addition & 0 deletions src/osd/osd_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -1521,6 +1521,7 @@ struct pg_pool_t {
std::ostream *out) const;
// Convenience overload: converts the candidate OSD vector to a set and
// defers to the set-based stretch_set_can_peer() above. Non-stretch
// pools trivially pass (no bucket constraints apply).
bool stretch_set_can_peer(const vector<int>& want, const OSDMap& osdmap,
                          std::ostream *out) const {
  // Fast path: peering_crush bucket rules only constrain stretch pools.
  if (!is_stretch_pool()) return true;
  set<int> swant;
  for (auto i : want) swant.insert(i);
  return stretch_set_can_peer(swant, osdmap, out);
Expand Down
35 changes: 35 additions & 0 deletions src/script/add_osd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# Add a new OSD to a local dev (vstart-style) cluster and start it.
#
# Usage: add_osd.sh <osd-id> <crush-location>
#   osd-id         numeric id of the new OSD (e.g. 4)
#   crush-location CRUSH location string,
#                  e.g. 'host=host1-1 datacenter=site1 root=default'

set -ex

CEPH_DEV_DIR=dev
CEPH_BIN=bin
osd=$1
location=$2
weight=.0990

# Refuse to run without both arguments: with an empty $osd the
# "rm -rf" below would target the bare "dev/osd" path instead of a
# per-OSD directory.
if [ -z "$osd" ] || [ -z "$location" ]; then
    echo "usage: $0 <osd-id> <crush-location>" >&2
    exit 1
fi

# DANGEROUS: wipes any previous data for this osd id
rm -rf "$CEPH_DEV_DIR/osd$osd"
mkdir -p "$CEPH_DEV_DIR/osd$osd"

uuid=$(uuidgen)
echo "add osd$osd $uuid"
OSD_SECRET=$("$CEPH_BIN/ceph-authtool" --gen-print-key)
echo "{\"cephx_secret\": \"$OSD_SECRET\"}" > "$CEPH_DEV_DIR/osd$osd/new.json"
"$CEPH_BIN/ceph" osd new "$uuid" -i "$CEPH_DEV_DIR/osd$osd/new.json"
rm "$CEPH_DEV_DIR/osd$osd/new.json"
# $ARGS is intentionally unquoted: it may carry multiple extra options.
"$CEPH_BIN/ceph-osd" -i "$osd" $ARGS --mkfs --key "$OSD_SECRET" --osd-uuid "$uuid"

key_fn=$CEPH_DEV_DIR/osd$osd/keyring
cat > "$key_fn" <<EOF
[osd.$osd]
key = $OSD_SECRET
EOF
echo "adding osd$osd key to auth repository"
"$CEPH_BIN/ceph" -i "$key_fn" auth add "osd.$osd" osd "allow *" mon "allow profile osd" mgr "allow profile osd"

# $location is intentionally unquoted: it is a space-separated list of
# CRUSH key=value pairs passed as separate arguments.
"$CEPH_BIN/ceph" osd crush add "osd.$osd" "$weight" $location

echo "start osd.$osd"
"$CEPH_BIN/ceph-osd" -i "$osd" $ARGS $COSD_ARGS
8 changes: 8 additions & 0 deletions src/script/extend_stretch_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
#
# Grow the 4-OSD stretch-mode dev cluster by adding two more OSDs to
# each of the two datacenter sites (osd.4-7), all under root=default.

set -ex

# add <osd-id> <host> <datacenter>
add() {
    ../src/script/add_osd.sh "$1" "host=$2 datacenter=$3 root=default"
}

add 4 host1-1 site1
add 5 host1-2 site1
add 6 host2-1 site2
add 7 host2-2 site2
10 changes: 5 additions & 5 deletions src/script/set_up_stretch_mode.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ set -x

./bin/ceph config set osd osd_crush_update_on_start false

./bin/ceph osd crush move osd.0 host=host1-1 datacenter=site1
./bin/ceph osd crush move osd.1 host=host1-2 datacenter=site1
./bin/ceph osd crush move osd.2 host=host2-1 datacenter=site2
./bin/ceph osd crush move osd.3 host=host2-2 datacenter=site2
./bin/ceph osd crush move osd.0 host=host1-1 datacenter=site1 root=default
./bin/ceph osd crush move osd.1 host=host1-2 datacenter=site1 root=default
./bin/ceph osd crush move osd.2 host=host2-1 datacenter=site2 root=default
./bin/ceph osd crush move osd.3 host=host2-2 datacenter=site2 root=default

./bin/ceph osd getcrushmap > crush.map.bin
./bin/crushtool -d crush.map.bin -o crush.map.txt
Expand All @@ -32,5 +32,5 @@ EOF
./bin/ceph mon set_location a datacenter=site1
./bin/ceph mon set_location b datacenter=site2
./bin/ceph mon set_location c datacenter=site3
./bin/ceph osd pool create test_stretch1 replicated
./bin/ceph osd pool create test_stretch1 1024 1024 replicated
./bin/ceph mon enable_stretch_mode c stretch_rule datacenter