Skip to content

Commit

Permalink
osd: use GMT time for the object name of hitsets
Browse files Browse the repository at this point in the history
* bump the encoding version of pg_hit_set_info_t to 2, so we can
  tell if the corresponding hit_set is named using localtime or
  GMT
* bump the encoding version of pg_pool_t to 20, so we can tell
  whether a pool is using GMT to name its hit_set archives, and
  whether the current cluster still allows OSDs that do not
  support GMT mode.
* add an option named `osd_pool_use_gmt_hitset`. if enabled,
  the cluster will try to use GMT mode when creating a new pool
  if all the up OSDs support GMT mode. if any of the
  pools in the cluster is using GMT mode, then only OSDs
  supporting GMT mode are allowed to join the cluster.

Fixes: #9732
Signed-off-by: Kefu Chai <kchai@redhat.com>
(cherry picked from commit 42f8c5d)

 Conflicts:
	src/include/ceph_features.h
	src/osd/ReplicatedPG.cc
	src/osd/osd_types.cc
	src/osd/osd_types.h
		fill pg_pool_t with default settings in master branch.
  • Loading branch information
tchaikov committed Oct 9, 2015
1 parent 629b631 commit 040e390
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 18 deletions.
1 change: 1 addition & 0 deletions src/common/config_opts.h
Expand Up @@ -497,6 +497,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages
OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
Expand Down
1 change: 1 addition & 0 deletions src/include/ceph_features.h
Expand Up @@ -64,6 +64,7 @@
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */
/* ... */
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)

Expand Down
21 changes: 20 additions & 1 deletion src/mon/OSDMonitor.cc
Expand Up @@ -16,6 +16,7 @@
*
*/

#include <algorithm>
#include <sstream>

#include "OSDMonitor.h"
Expand Down Expand Up @@ -1572,6 +1573,9 @@ void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
failure_info.clear();
}

// Predicate for std::find_if over the OSDMap pool table: returns true when
// the given (pool id, pool) entry has use_gmt_hitset set.
//
// The key is spelled `const int64_t` so the parameter matches the map's
// value_type exactly. With a non-const key the reference binds to a
// converted temporary, which copies the entire pg_pool_t for every pool
// the algorithm inspects.
// NOTE(review): assumes osdmap.get_pools() is a std::map<int64_t, pg_pool_t>
// (or another container with the same value_type) -- confirm.
static bool uses_gmt_hitset(const std::pair<const int64_t, pg_pool_t>& pool) {
  return pool.second.use_gmt_hitset;
}

// boot --

Expand Down Expand Up @@ -1641,6 +1645,19 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
}
}

if (std::find_if(osdmap.get_pools().begin(),
osdmap.get_pools().end(),
uses_gmt_hitset) != osdmap.get_pools().end()) {
assert(osdmap.get_num_up_osds() == 0 ||
osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
<< m->get_orig_source_inst()
<< " doesn't announce support -- ignore" << dendl;
goto ignore;
}
}

// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
Expand Down Expand Up @@ -4075,7 +4092,9 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
if (g_conf->osd_pool_default_flag_nosizechange)
pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);

if (g_conf->osd_pool_use_gmt_hitset &&
(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
pi->use_gmt_hitset = true;
pi->size = size;
pi->min_size = min_size;
pi->crush_ruleset = crush_ruleset;
Expand Down
27 changes: 17 additions & 10 deletions src/osd/ReplicatedPG.cc
Expand Up @@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
p != info.hit_set.history.end();
++p) {
if (stamp >= p->begin && stamp <= p->end) {
oid = get_hit_set_archive_object(p->begin, p->end);
oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
break;
}
}
Expand Down Expand Up @@ -10110,10 +10110,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
return hoid;
}

hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
utime_t end,
bool using_gmt)
{
ostringstream ss;
ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
ss << "hit_set_" << info.pgid.pgid << "_archive_";
if (using_gmt) {
start.gmtime(ss) << "_";
end.gmtime(ss);
} else {
start.localtime(ss) << "_";
end.localtime(ss);
}
hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
info.pgid.ps(), info.pgid.pool(),
cct->_conf->osd_hit_set_namespace);
Expand Down Expand Up @@ -10250,7 +10259,7 @@ void ReplicatedPG::hit_set_persist()
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
Expand All @@ -10259,10 +10268,8 @@ void ReplicatedPG::hit_set_persist()
return;
}

oid = get_hit_set_archive_object(start, now);
oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
// If the current object is degraded we skip this persist request
if (is_degraded_or_backfilling_object(oid))
return;
if (scrubber.write_blocked_by_scrub(oid))
return;

Expand Down Expand Up @@ -10353,7 +10360,7 @@ void ReplicatedPG::hit_set_persist()

updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
hit_set_create();
updated_hit_set_hist.current_info = pg_hit_set_info_t();
updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
updated_hit_set_hist.current_last_stamp = utime_t();

// fabricate an object_info_t and SnapSet
Expand Down Expand Up @@ -10416,7 +10423,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
assert(p != updated_hit_set_hist.history.end());
hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

assert(!is_degraded_or_backfilling_object(oid));

Expand Down Expand Up @@ -10701,7 +10708,7 @@ void ReplicatedPG::agent_load_hit_sets()
continue;
}

hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
if (is_unreadable_object(oid)) {
dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
break;
Expand Down
4 changes: 3 additions & 1 deletion src/osd/ReplicatedPG.h
Expand Up @@ -903,7 +903,9 @@ class ReplicatedPG : public PG, public PGBackend::Listener {
void hit_set_in_memory_trim(); ///< discard old in memory HitSets

hobject_t get_hit_set_current_object(utime_t stamp);
hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
hobject_t get_hit_set_archive_object(utime_t start,
utime_t end,
bool using_gmt);

// agent
boost::scoped_ptr<TierAgentState> agent_state;
Expand Down
26 changes: 24 additions & 2 deletions src/osd/osd_types.cc
Expand Up @@ -926,6 +926,7 @@ void pg_pool_t::dump(Formatter *f) const
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
f->dump_bool("use_gmt_hitset", use_gmt_hitset);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
f->dump_unsigned("expected_num_objects", expected_num_objects);
Expand Down Expand Up @@ -1280,6 +1281,9 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(last_force_op_resend, bl);
::encode(min_read_recency_for_promote, bl);
::encode(expected_num_objects, bl);
::encode(uint32_t(.6 * 1e6), bl);
::encode(uint32_t(1), bl);
::encode(use_gmt_hitset, bl);
ENCODE_FINISH(bl);
}

Expand Down Expand Up @@ -1397,6 +1401,17 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
} else {
expected_num_objects = 0;
}
if (struct_v >= 19) {
uint32_t dummy;
::decode(dummy, bl);
}
if (struct_v >= 20) {
uint32_t dummy;
::decode(dummy, bl);
::decode(use_gmt_hitset, bl);
} else {
use_gmt_hitset = false;
}
DECODE_FINISH(bl);
calc_pg_masks();
}
Expand Down Expand Up @@ -3789,19 +3804,25 @@ void pg_create_t::generate_test_instances(list<pg_create_t*>& o)

// Serialize this hit-set record into bl: begin, end, version, then using_gmt.
// NOTE(review): this listing interleaves the pre-change and post-change lines
// of the diff, so both ENCODE_START lines appear; per the commit message the
// struct version is bumped to 2 -- confirm only the (2, 1, bl) line survives
// in the real file.
void pg_hit_set_info_t::encode(bufferlist& bl) const
{
ENCODE_START(1, 1, bl);
ENCODE_START(2, 1, bl);
::encode(begin, bl);
::encode(end, bl);
::encode(version, bl);
// using_gmt is appended last; decode() only reads it when struct_v >= 2.
::encode(using_gmt, bl);
ENCODE_FINISH(bl);
}

// Deserialize a hit-set record from p, reading fields in the same order
// encode() writes them: begin, end, version, then (if present) using_gmt.
// NOTE(review): this listing interleaves the pre-change and post-change lines
// of the diff, so both DECODE_START lines appear; per the commit message the
// struct version is bumped to 2 -- confirm only the (2, p) line survives in
// the real file.
void pg_hit_set_info_t::decode(bufferlist::iterator& p)
{
DECODE_START(1, p);
DECODE_START(2, p);
::decode(begin, p);
::decode(end, p);
::decode(version, p);
// Records older than struct version 2 carry no using_gmt field; they are
// treated as not using GMT, which selects the localtime naming branch in
// get_hit_set_archive_object().
if (struct_v >= 2) {
::decode(using_gmt, p);
} else {
using_gmt = false;
}
DECODE_FINISH(p);
}

Expand All @@ -3810,6 +3831,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->dump_stream("version") << version;
f->dump_stream("using_gmt") << using_gmt;
}

void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
Expand Down
11 changes: 7 additions & 4 deletions src/osd/osd_types.h
Expand Up @@ -1035,6 +1035,7 @@ struct pg_pool_t {
HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count; ///< number of periods to retain
bool use_gmt_hitset; ///< use gmt to name the hitset archive object
uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote

uint32_t stripe_width; ///< erasure coded stripe size in bytes
Expand Down Expand Up @@ -1063,6 +1064,7 @@ struct pg_pool_t {
hit_set_params(),
hit_set_period(0),
hit_set_count(0),
use_gmt_hitset(true),
min_read_recency_for_promote(0),
stripe_width(0),
expected_num_objects(0)
Expand Down Expand Up @@ -1600,10 +1602,11 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
struct pg_hit_set_info_t {
utime_t begin, end; ///< time interval
eversion_t version; ///< version this HitSet object was written

pg_hit_set_info_t() {}
pg_hit_set_info_t(utime_t b)
: begin(b) {}
bool using_gmt; ///< use gmt for creating the hit_set archive object name
pg_hit_set_info_t(bool using_gmt = true)
: using_gmt(using_gmt) {}
pg_hit_set_info_t(utime_t b, bool using_gmt)
: begin(b), using_gmt(using_gmt) {}

void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);
Expand Down

0 comments on commit 040e390

Please sign in to comment.