Skip to content

Commit

Permalink
Merge pull request #869 from ceph/wip-crush
Browse files Browse the repository at this point in the history
crush changes for erasure coding

Reviewed-by: Loic Dachary <loic@dachary.org>
Reviewed-by: Samuel Just <sam.just@inktank.com>
  • Loading branch information
Sage Weil committed Dec 8, 2013
2 parents 096f9b3 + e8fdef2 commit 94da215
Show file tree
Hide file tree
Showing 20 changed files with 11,087 additions and 111 deletions.
10 changes: 10 additions & 0 deletions PendingReleaseNotes
@@ -1,10 +1,20 @@
v0.73
~~~~~

- As part of the fix for #6796, 'ceph osd pool set <pool> <var> <arg>' now
receives <arg> as an integer instead of a string. This affects how the
'hashpspool' flag is set/unset: instead of 'true' or 'false', it now
must be '0' or '1'.

- The behavior of the CRUSH 'indep' choose mode has been changed. No
ceph cluster should have been using this behavior unless someone has
manually extracted a crush map, modified a CRUSH rule to replace
'firstn' with 'indep', recompiled, and reinjected the new map into
the cluster. If the 'indep' mode is currently in use on a cluster,
the rule should be modified to use 'firstn' instead, and the
administrator should wait until any data movement completes before
upgrading.

v0.72.2
~~~~~~~
- As part of fix for #6796, 'ceph osd pool set <pool> <var> <arg>' now
Expand Down
33 changes: 30 additions & 3 deletions src/crush/CrushCompiler.cc
Expand Up @@ -233,6 +233,9 @@ int CrushCompiler::decompile(ostream &out)
case CEPH_PG_TYPE_RAID4:
out << "\ttype raid4\n";
break;
case CEPH_PG_TYPE_ERASURE:
out << "\ttype erasure\n";
break;
default:
out << "\ttype " << crush.get_rule_mask_type(i) << "\n";
}
Expand All @@ -253,6 +256,14 @@ int CrushCompiler::decompile(ostream &out)
case CRUSH_RULE_EMIT:
out << "\tstep emit\n";
break;
case CRUSH_RULE_SET_CHOOSE_TRIES:
out << "\tstep set_choose_tries " << crush.get_rule_arg1(i, j)
<< "\n";
break;
case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
out << "\tstep set_chooseleaf_tries " << crush.get_rule_arg1(i, j)
<< "\n";
break;
case CRUSH_RULE_CHOOSE_FIRSTN:
out << "\tstep choose firstn "
<< crush.get_rule_arg1(i, j)
Expand All @@ -267,14 +278,14 @@ int CrushCompiler::decompile(ostream &out)
print_type_name(out, crush.get_rule_arg2(i, j), crush);
out << "\n";
break;
case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
out << "\tstep chooseleaf firstn "
<< crush.get_rule_arg1(i, j)
<< " type ";
print_type_name(out, crush.get_rule_arg2(i, j), crush);
out << "\n";
break;
case CRUSH_RULE_CHOOSE_LEAF_INDEP:
case CRUSH_RULE_CHOOSELEAF_INDEP:
out << "\tstep chooseleaf indep "
<< crush.get_rule_arg1(i, j)
<< " type ";
Expand Down Expand Up @@ -568,8 +579,10 @@ int CrushCompiler::parse_rule(iter_t const& i)
int type;
if (tname == "replicated")
type = CEPH_PG_TYPE_REP;
else if (tname == "raid4")
else if (tname == "raid4")
type = CEPH_PG_TYPE_RAID4;
else if (tname == "erasure")
type = CEPH_PG_TYPE_ERASURE;
else
assert(0);

Expand Down Expand Up @@ -601,6 +614,20 @@ int CrushCompiler::parse_rule(iter_t const& i)
}
break;

case crush_grammar::_step_set_choose_tries:
{
int val = int_node(s->children[1]);
crush.set_rule_step_set_choose_tries(ruleno, step++, val);
}
break;

case crush_grammar::_step_set_chooseleaf_tries:
{
int val = int_node(s->children[1]);
crush.set_rule_step_set_chooseleaf_tries(ruleno, step++, val);
}
break;

case crush_grammar::_step_choose:
case crush_grammar::_step_chooseleaf:
{
Expand Down
169 changes: 159 additions & 10 deletions src/crush/CrushWrapper.cc
Expand Up @@ -6,6 +6,23 @@

#define dout_subsys ceph_subsys_crush

bool CrushWrapper::has_v2_rules() const
{
// check rules for use of indep or new SET_* rule steps
for (unsigned i=0; i<crush->max_rules; i++) {
crush_rule *r = crush->rules[i];
if (!r)
continue;
for (unsigned j=0; j<r->len; j++) {
if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES)
return true;
}
}
return false;
}

void CrushWrapper::find_takes(set<int>& roots) const
{
Expand Down Expand Up @@ -639,7 +656,9 @@ void CrushWrapper::reweight(CephContext *cct)
}
}

int CrushWrapper::add_simple_rule(string name, string root_name, string failure_domain_name,
int CrushWrapper::add_simple_rule(string name, string root_name,
string failure_domain_name,
string mode,
ostream *err)
{
if (rule_exists(name)) {
Expand All @@ -662,6 +681,11 @@ int CrushWrapper::add_simple_rule(string name, string root_name, string failure_
return -EINVAL;
}
}
if (mode != "firstn" && mode != "indep") {
if (err)
*err << "unknown mode " << mode;
return -EINVAL;
}

int ruleset = 0;
for (int i = 0; i < get_max_rules(); i++) {
Expand All @@ -671,20 +695,28 @@ int CrushWrapper::add_simple_rule(string name, string root_name, string failure_
}
}

crush_rule *rule = crush_make_rule(3, ruleset, 1 /* pg_pool_t::TYPE_REP */, 1, 10);
int steps = 3;
if (mode == "indep")
steps = 4;
crush_rule *rule = crush_make_rule(steps, ruleset, 1 /* pg_pool_t::TYPE_REP */, 1, 10);
assert(rule);
crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
int step = 0;
if (mode == "indep")
crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
if (type)
crush_rule_set_step(rule, 1,
CRUSH_RULE_CHOOSE_LEAF_FIRSTN,
crush_rule_set_step(rule, step++,
mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN :
CRUSH_RULE_CHOOSELEAF_INDEP,
CRUSH_CHOOSE_N,
type);
else
crush_rule_set_step(rule, 1,
CRUSH_RULE_CHOOSE_FIRSTN,
crush_rule_set_step(rule, step++,
mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN :
CRUSH_RULE_CHOOSE_INDEP,
CRUSH_CHOOSE_N,
0);
crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
int rno = crush_add_rule(crush, rule, -1);
set_rule_name(rno, name);
have_rmaps = false;
Expand Down Expand Up @@ -1083,16 +1115,24 @@ void CrushWrapper::dump_rules(Formatter *f) const
f->dump_int("num", get_rule_arg1(i, j));
f->dump_string("type", get_type_name(get_rule_arg2(i, j)));
break;
case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
f->dump_string("op", "chooseleaf_firstn");
f->dump_int("num", get_rule_arg1(i, j));
f->dump_string("type", get_type_name(get_rule_arg2(i, j)));
break;
case CRUSH_RULE_CHOOSE_LEAF_INDEP:
case CRUSH_RULE_CHOOSELEAF_INDEP:
f->dump_string("op", "chooseleaf_indep");
f->dump_int("num", get_rule_arg1(i, j));
f->dump_string("type", get_type_name(get_rule_arg2(i, j)));
break;
case CRUSH_RULE_SET_CHOOSE_TRIES:
f->dump_string("op", "set_choose_tries");
f->dump_int("num", get_rule_arg1(i, j));
break;
case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
f->dump_string("op", "set_chooseleaf_tries");
f->dump_int("num", get_rule_arg1(i, j));
break;
default:
f->dump_int("opcode", get_rule_op(i, j));
f->dump_int("arg1", get_rule_arg1(i, j));
Expand All @@ -1114,6 +1154,115 @@ void CrushWrapper::list_rules(Formatter *f) const
}
}

/*
 * Pending-work entry for the hierarchy walk in dump_tree(): one item
 * together with how deep it sits and the weight to report for it.
 */
struct qi {
  int item;      // item id taken from the queue
  int depth;     // nesting level; the root entries are queued with 0
  float weight;  // weight to print for this item

  // Zero-initialised entry.
  qi()
    : item(0),
      depth(0),
      weight(0)
  {}

  // Entry for a specific item at a given depth and weight.
  qi(int id, int level, float wt)
    : item(id),
      depth(level),
      weight(wt)
  {}
};

/*
 * Dump the CRUSH hierarchy as a tree, walking depth-first from every root
 * returned by find_roots().
 *
 * @param w   per-device reweight values, scaled by 0x10000 and indexed by
 *            device id; a device whose id is not covered by w is reported
 *            with a reweight of 0 instead of reading past the end of w
 * @param out if non-NULL, receives a tab-separated plain-text listing
 * @param f   if non-NULL, receives a "nodes" array holding one object per
 *            item, followed by a "stray" array (currently always empty)
 */
void CrushWrapper::dump_tree(const vector<__u32>& w, ostream *out, Formatter *f) const
{
  if (out)
    *out << "# id\tweight\ttype name\treweight\n";
  if (f)
    f->open_array_section("nodes");

  // Devices seen during the walk.  Nothing reads this set yet; the "stray"
  // section at the end is emitted empty.
  set<int> touched;
  set<int> roots;
  find_roots(roots);
  for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
    // q is used as a stack: children are pushed on the front, so the walk
    // is depth-first and a bucket's items are visited in bucket order.
    list<qi> q;
    q.push_back(qi(*p, 0, get_bucket_weight(*p) / (float)0x10000));
    while (!q.empty()) {
      int cur = q.front().item;
      int depth = q.front().depth;
      float weight = q.front().weight;
      q.pop_front();

      if (out) {
        *out << cur << "\t";
        std::streamsize oldprecision = out->precision();
        *out << std::setprecision(4) << weight
             << std::setprecision(oldprecision) << "\t";

        // indent the name column by one tab per level
        for (int k = 0; k < depth; k++)
          *out << "\t";
      }
      if (f) {
        f->open_object_section("item");
      }

      if (cur >= 0) {
        // Non-negative ids are printed as devices ("osd.N"); they have no
        // children to queue.
        if (f) {
          f->dump_unsigned("id", cur);
          f->dump_stream("name") << "osd." << cur;
          f->dump_string("type", get_type_name(0));
          f->dump_int("type_id", 0);
        }
        if (out)
          *out << "osd." << cur << "\t";

        // The device id comes from the CRUSH map while w comes from the
        // caller, so guard the lookup instead of indexing blindly.
        double wf = 0;
        if ((unsigned)cur < w.size())
          wf = (double)w[cur] / (double)0x10000;
        if (out) {
          std::streamsize saved = out->precision();
          *out << std::setprecision(4)
               << wf
               << std::setprecision(saved)
               << "\t";
        }
        if (f) {
          f->dump_float("reweight", wf);
        }

        if (out)
          *out << "\n";
        if (f) {
          f->dump_float("crush_weight", weight);
          f->dump_unsigned("depth", depth);
          f->close_section();
        }
        touched.insert(cur);
        continue;
      }

      // queue bucket contents...
      int type = get_bucket_type(cur);
      int s = get_bucket_size(cur);
      if (f) {
        f->dump_int("id", cur);
        f->dump_string("name", get_item_name(cur));
        f->dump_string("type", get_type_name(type));
        f->dump_int("type_id", type);
        f->open_array_section("children");
      }
      // Walk the items back to front so that, after push_front, item 0 is
      // the next one popped.
      for (int k = s - 1; k >= 0; k--) {
        int item = get_bucket_item(cur, k);
        q.push_front(qi(item, depth + 1,
                        (float)get_bucket_item_weight(cur, k) / (float)0x10000));
        if (f)
          f->dump_int("child", item);
      }
      if (f)
        f->close_section();  // children

      if (out)
        *out << get_type_name(type) << " " << get_item_name(cur) << "\n";
      if (f) {
        f->close_section();  // item
      }
    }
  }

  if (f) {
    f->close_section();  // nodes
    f->open_array_section("stray");
    f->close_section();  // stray
  }
}

void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
{
o.push_back(new CrushWrapper);
Expand Down
17 changes: 13 additions & 4 deletions src/crush/CrushWrapper.h
Expand Up @@ -161,6 +161,7 @@ class CrushWrapper {
return
crush->chooseleaf_descend_once != 0;
}
bool has_v2_rules() const;

// bucket types
int get_num_type_names() const {
Expand Down Expand Up @@ -566,24 +567,30 @@ class CrushWrapper {
// Set step `step` of rule `ruleno` to a CRUSH_RULE_TAKE op, passing val as
// the first step argument and 0 as the second.  Returns whatever
// set_rule_step() returns.
int set_rule_step_take(unsigned ruleno, unsigned step, int val) {
return set_rule_step(ruleno, step, CRUSH_RULE_TAKE, val, 0);
}
// Set step `step` of rule `ruleno` to a CRUSH_RULE_SET_CHOOSE_TRIES op,
// passing val as the first step argument and 0 as the second.  Returns
// whatever set_rule_step() returns.
int set_rule_step_set_choose_tries(unsigned ruleno, unsigned step, int val) {
return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSE_TRIES, val, 0);
}
// Set step `step` of rule `ruleno` to a CRUSH_RULE_SET_CHOOSELEAF_TRIES op,
// passing val as the first step argument and 0 as the second.  Returns
// whatever set_rule_step() returns.
int set_rule_step_set_chooseleaf_tries(unsigned ruleno, unsigned step, int val) {
return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_TRIES, val, 0);
}
// Set step `step` of rule `ruleno` to a CRUSH_RULE_CHOOSE_FIRSTN op,
// passing val as the first step argument and type as the second.  Returns
// whatever set_rule_step() returns.
int set_rule_step_choose_firstn(unsigned ruleno, unsigned step, int val, int type) {
return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_FIRSTN, val, type);
}
// Set step `step` of rule `ruleno` to a CRUSH_RULE_CHOOSE_INDEP op,
// passing val as the first step argument and type as the second.  Returns
// whatever set_rule_step() returns.
int set_rule_step_choose_indep(unsigned ruleno, unsigned step, int val, int type) {
return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_INDEP, val, type);
}
int set_rule_step_choose_leaf_firstn(unsigned ruleno, unsigned step, int val, int type) {
return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, val, type);
return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_FIRSTN, val, type);
}
int set_rule_step_choose_leaf_indep(unsigned ruleno, unsigned step, int val, int type) {
return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_LEAF_INDEP, val, type);
return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_INDEP, val, type);
}
// Set step `step` of rule `ruleno` to a CRUSH_RULE_EMIT op; both step
// arguments are passed as 0.  Returns whatever set_rule_step() returns.
int set_rule_step_emit(unsigned ruleno, unsigned step) {
return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0);
}

int add_simple_rule(string name, string root_name, string failure_domain_type,
ostream *err = 0);
string mode, ostream *err = 0);

int remove_rule(int ruleno);

Expand Down Expand Up @@ -767,7 +774,8 @@ class CrushWrapper {
const vector<__u32>& weight) const {
Mutex::Locker l(mapper_lock);
int rawout[maxout];
int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0], weight.size());
int scratch[maxout * 3];
int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0], weight.size(), scratch);
if (numrep < 0)
numrep = 0;
out.resize(numrep);
Expand Down Expand Up @@ -796,6 +804,7 @@ class CrushWrapper {
void dump(Formatter *f) const;
void dump_rules(Formatter *f) const;
void list_rules(Formatter *f) const;
void dump_tree(const vector<__u32>& w, ostream *out, Formatter *f) const;
static void generate_test_instances(list<CrushWrapper*>& o);


Expand Down

0 comments on commit 94da215

Please sign in to comment.