mimic: crush: backport recent upmap fixes #27963

Merged (6 commits, May 9, 2019)

osd/OSDMap: calc_pg_upmaps - restrict optimization to origin pools only

The current implementation tries to cancel any pg_upmap that would
otherwise re-map a PG out of an underfull OSD. This is wrong; for
example, it can reliably fire the following assert:

src/osd/OSDMap.cc: 4405: FAILED assert(osd_weight.count(i.first))

It also violates the admin's expectation when automatic balancing
has been strictly restricted to specific pools.

Fix this by excluding any PG that does not belong to the origin pools
passed in when performing upmap/unmap.

Fixes: http://tracker.ceph.com/issues/38897
Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
(cherry picked from commit 01e8e94)
xiexingguo committed Mar 23, 2019
commit ef6b8d47fd34e053744e93eac414caf778e543a9
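For context, here is a minimal, self-contained sketch (a toy illustration, not the real calc_pg_upmaps code) of what the pool filter added in the hunk below accomplishes. The names osd_weight, pg_upmap_items and only_pools mirror the names in the diff; the pool ids, OSD numbers and weights are invented, and the pg_t key is simplified to a plain pool id.

#include <cassert>
#include <cstdint>
#include <map>
#include <set>
#include <utility>
#include <vector>

int main() {
  // Weights are only computed for the OSDs backing the pools being balanced
  // (say pool 1 lives on osd.0..osd.4); the values here are made up.
  std::map<int, double> osd_weight = {
    {0, 1.0}, {1, 1.0}, {2, 1.0}, {3, 1.0}, {4, 1.0}
  };

  // Stand-in for pg_upmap_items: (pool id, <from-osd, to-osd>).
  // In the real code the key is a pg_t and the filter uses i.first.pool().
  std::vector<std::pair<int64_t, std::pair<int, int>>> pg_upmap_items = {
    {1, {1, 2}},   // pool 1: osd.1 -> osd.2
    {2, {0, 10}},  // pool 2: osd.0 -> osd.10; osd.10 has no weight entry
  };

  std::set<int64_t> only_pools = {1};  // balancing restricted to pool 1

  std::vector<std::pair<int64_t, std::pair<int, int>>> candidates;
  for (auto& i : pg_upmap_items) {
    // The guard added by this commit: skip PGs from pools that were not
    // requested for optimization.
    if (!only_pools.empty() && !only_pools.count(i.first))
      continue;
    candidates.push_back(i);
  }

  // Every surviving candidate now only references weighted OSDs, so a check
  // equivalent to "assert(osd_weight.count(...))" cannot fire.
  for (auto& c : candidates) {
    assert(osd_weight.count(c.second.first));
    assert(osd_weight.count(c.second.second));
  }
  return 0;
}

The test added at the bottom of this diff (BUG_38897) exercises exactly this scenario: two pools on disjoint OSDs, a pg_upmap_item on the second pool, and calc_pg_upmaps restricted to the first pool.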
@@ -4483,6 +4483,8 @@ int OSDMap::calc_pg_upmaps(
     for (auto& i : tmp.pg_upmap_items) {
       if (to_skip.count(i.first))
         continue;
+      if (!only_pools.empty() && !only_pools.count(i.first.pool()))
+        continue;
       candidates.push_back(make_pair(i.first, i.second));
     }
     if (aggressive) {
@@ -27,7 +27,7 @@ int main(int argc, char **argv) {
 }
 
 class OSDMapTest : public testing::Test {
-  const static int num_osds = 6;
+  int num_osds = 6;
 public:
   OSDMap osdmap;
   OSDMapMapping mapping;
@@ -37,7 +37,8 @@ class OSDMapTest : public testing::Test {

   OSDMapTest() {}
 
-  void set_up_map() {
+  void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
+    num_osds = new_num_osds;
     uuid_d fsid;
     osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
     OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
@@ -56,6 +57,8 @@ class OSDMapTest : public testing::Test {
       pending_inc.new_uuid[i] = sample_uuid;
     }
     osdmap.apply_incremental(pending_inc);
+    if (no_default_pools) // do not create any default pool(s)
+      return;
 
     // Create an EC ruleset and a pool using it
     int r = osdmap.crush->add_simple_rule(
@@ -1063,6 +1066,229 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
}
}

TEST_F(OSDMapTest, BUG_38897) {
// http://tracker.ceph.com/issues/38897
// build a fresh map with 12 OSDs, without any default pools
set_up_map(12, true);
const string pool_1("pool1");
const string pool_2("pool2");
int64_t pool_1_id = -1;

{
// build customized crush rule for "pool1"
string host_name = "host_for_pool_1";
// build a customized host to capture osd.1~5
for (int i = 1; i < 5; i++) {
stringstream osd_name;
vector<string> move_to;
osd_name << "osd." << i;
move_to.push_back("root=default");
string host_loc = "host=" + host_name;
move_to.push_back(host_loc);
auto r = crush_move(osdmap, osd_name.str(), move_to);
ASSERT_EQ(0, r);
}
CrushWrapper crush;
get_crush(osdmap, crush);
auto host_id = crush.get_item_id(host_name);
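// CRUSH bucket (host) ids are negative, so a negative id confirms
// the new host bucket exists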
ASSERT_TRUE(host_id < 0);
string rule_name = "rule_for_pool1";
int rule_type = pg_pool_t::TYPE_REPLICATED;
ASSERT_TRUE(!crush.rule_exists(rule_name));
int rno;
for (rno = 0; rno < crush.get_max_rules(); rno++) {
if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
break;
}
int min_size = 3;
int max_size = 3;
int steps = 7;
crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
int step = 0;
crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
// always choose osd.0
crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
// then pick any other random osds
crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
ASSERT_TRUE(step == steps);
auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
ASSERT_TRUE(r >= 0);
crush.set_rule_name(rno, rule_name);
{
OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
pending_inc.crush.clear();
crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
osdmap.apply_incremental(pending_inc);
}

// create "pool1"
OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
pending_inc.new_pool_max = osdmap.get_pool_max();
auto pool_id = ++pending_inc.new_pool_max;
pool_1_id = pool_id;
pg_pool_t empty;
auto p = pending_inc.get_new_pool(pool_id, &empty);
p->size = 3;
p->min_size = 1;
p->set_pg_num(3);
p->set_pgp_num(3);
p->type = pg_pool_t::TYPE_REPLICATED;
p->crush_rule = rno;
p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
pending_inc.new_pool_names[pool_id] = pool_1;
osdmap.apply_incremental(pending_inc);
ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
{
for (unsigned i = 0; i < 3; i++) {
// 1.x -> [1]
pg_t rawpg(i, pool_id);
pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
vector<int> up;
int up_primary;
osdmap.pg_to_raw_up(pgid, &up, &up_primary);
ASSERT_TRUE(up.size() == 3);
ASSERT_TRUE(up[0] == 0);

// insert a new pg_upmap
vector<int32_t> new_up;
// and remap 1.x to osd.1 only
// this way osd.0 is deemed to be *underfull*
// and osd.1 is deemed to be *overfull*
new_up.push_back(1);
{
OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
new_up.begin(), new_up.end());
osdmap.apply_incremental(pending_inc);
}
osdmap.pg_to_raw_up(pgid, &up, &up_primary);
ASSERT_TRUE(up.size() == 1);
ASSERT_TRUE(up[0] == 1);
}
}
}

{
// build customized crush rule for "pool2"
string host_name = "host_for_pool_2";
// build a customized host to capture osd.6~11
for (int i = 6; i < (int)get_num_osds(); i++) {
stringstream osd_name;
vector<string> move_to;
osd_name << "osd." << i;
move_to.push_back("root=default");
string host_loc = "host=" + host_name;
move_to.push_back(host_loc);
auto r = crush_move(osdmap, osd_name.str(), move_to);
ASSERT_EQ(0, r);
}
CrushWrapper crush;
get_crush(osdmap, crush);
auto host_id = crush.get_item_id(host_name);
ASSERT_TRUE(host_id < 0);
string rule_name = "rule_for_pool2";
int rule_type = pg_pool_t::TYPE_REPLICATED;
ASSERT_TRUE(!crush.rule_exists(rule_name));
int rno;
for (rno = 0; rno < crush.get_max_rules(); rno++) {
if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
break;
}
int min_size = 3;
int max_size = 3;
int steps = 7;
crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
int step = 0;
crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
// always choose osd.0
crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
// then pick any other random osds
crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
ASSERT_TRUE(step == steps);
auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
ASSERT_TRUE(r >= 0);
crush.set_rule_name(rno, rule_name);
{
OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
pending_inc.crush.clear();
crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
osdmap.apply_incremental(pending_inc);
}

// create "pool2"
OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
pending_inc.new_pool_max = osdmap.get_pool_max();
auto pool_id = ++pending_inc.new_pool_max;
pg_pool_t empty;
auto p = pending_inc.get_new_pool(pool_id, &empty);
p->size = 3;
// include a single PG
p->set_pg_num(1);
p->set_pgp_num(1);
p->type = pg_pool_t::TYPE_REPLICATED;
p->crush_rule = rno;
p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
pending_inc.new_pool_names[pool_id] = pool_2;
osdmap.apply_incremental(pending_inc);
ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
pg_t rawpg(0, pool_id);
pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
vector<int> up;
int up_primary;
osdmap.pg_to_raw_up(pgid, &up, &up_primary);
ASSERT_TRUE(up.size() == 3);
ASSERT_TRUE(up[0] == 0);

{
// build a pg_upmap_item that will
// remap pg out from *underfull* osd.0
vector<pair<int32_t,int32_t>> new_pg_upmap_items;
new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
pending_inc.new_pg_upmap_items[pgid] =
mempool::osdmap::vector<pair<int32_t,int32_t>>(
new_pg_upmap_items.begin(), new_pg_upmap_items.end());
osdmap.apply_incremental(pending_inc);
ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
vector<int> up;
int up_primary;
osdmap.pg_to_raw_up(pgid, &up, &up_primary);
ASSERT_TRUE(up.size() == 3);
ASSERT_TRUE(up[0] == 10);
}
}

// ready to go
{
// require perfect distribution!
auto ret = g_ceph_context->_conf->set_val(
"osd_calc_pg_upmaps_max_stddev", "0");
ASSERT_EQ(0, ret);
g_ceph_context->_conf->apply_changes(nullptr);
set<int64_t> only_pools;
ASSERT_TRUE(pool_1_id >= 0);
only_pools.insert(pool_1_id);
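// Only pool1 is optimized here, while pool2 still carries the
// pg_upmap_item installed above (osd.0 -> osd.10). Without the pool
// filter added by this commit, calc_pg_upmaps would also try to cancel
// that foreign entry (it remaps a PG out of the "underfull" osd.0) and
// could hit "FAILED assert(osd_weight.count(i.first))", since pool2's
// OSDs are not part of the optimization.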
OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
osdmap.calc_pg_upmaps(g_ceph_context,
0, // so we can force optimizing
100,
only_pools,
&pending_inc);
osdmap.apply_incremental(pending_inc);
}
}

TEST(PGTempMap, basic)
{
PGTempMap m;