Skip to content

Commit

Permalink
Merge pull request #1086 from ceph/wip-temp-primary
Browse files Browse the repository at this point in the history
Add a "primary temp" to go with "pg temp"

Reviewed-by: Sage Weil <sage@inktank.com>
  • Loading branch information
Sage Weil committed Jan 16, 2014
2 parents 8b09a43 + e7b3236 commit 3b696f2
Show file tree
Hide file tree
Showing 13 changed files with 766 additions and 136 deletions.
4 changes: 2 additions & 2 deletions src/client/SyntheticClient.cc
Expand Up @@ -1679,7 +1679,7 @@ int SyntheticClient::dump_placement(string& fn) {
for (vector<ObjectExtent>::iterator i = extents.begin();
i != extents.end(); ++i) {

int osd = client->osdmap->get_pg_primary(client->osdmap->object_locator_to_pg(i->oid, i->oloc));
int osd = client->osdmap->get_pg_acting_primary(client->osdmap->object_locator_to_pg(i->oid, i->oloc));

// run through all the buffer extents
for (vector<pair<uint64_t, uint64_t> >::iterator j = i->buffer_extents.begin();
Expand Down Expand Up @@ -1959,7 +1959,7 @@ int SyntheticClient::overload_osd_0(int n, int size, int wrsize) {
int SyntheticClient::check_first_primary(int fh) {
vector<ObjectExtent> extents;
client->enumerate_layout(fh, extents, 1, 0);
return client->osdmap->get_pg_primary(client->osdmap->object_locator_to_pg(extents.begin()->oid,
return client->osdmap->get_pg_acting_primary(client->osdmap->object_locator_to_pg(extents.begin()->oid,
extents.begin()->oloc));
}

Expand Down
2 changes: 2 additions & 0 deletions src/include/ceph_features.h
Expand Up @@ -44,6 +44,7 @@
#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39) /* supports new-style OSDMap encoding */

/*
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
Expand Down Expand Up @@ -110,6 +111,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
CEPH_FEATURE_CRUSH_V2 | \
CEPH_FEATURE_EXPORT_PEER | \
CEPH_FEATURE_OSD_ERASURE_CODES | \
CEPH_FEATURE_OSDMAP_ENC | \
0ULL)

#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
Expand Down
66 changes: 22 additions & 44 deletions src/mon/OSDMonitor.cc
Expand Up @@ -248,6 +248,10 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
osd_epoch.erase(p++);
}

/** we don't have any of the feature bit infrastructure in place for
* supporting primary_temp mappings without breaking old clients/OSDs.*/
assert(osdmap.primary_temp->empty());

if (mon->is_leader()) {
// kick pgmon, make sure it's seen the latest map
mon->pgmon()->check_osd_map(osdmap.epoch);
Expand Down Expand Up @@ -410,45 +414,6 @@ void OSDMonitor::update_logger()
mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}

void OSDMonitor::remove_redundant_pg_temp()
{
dout(10) << "remove_redundant_pg_temp" << dendl;

for (map<pg_t,vector<int> >::iterator p = osdmap.pg_temp->begin();
p != osdmap.pg_temp->end();
++p) {
if (pending_inc.new_pg_temp.count(p->first) == 0) {
vector<int> raw_up;
osdmap.pg_to_raw_up(p->first, raw_up);
if (raw_up == p->second) {
dout(10) << " removing unnecessary pg_temp " << p->first << " -> " << p->second << dendl;
pending_inc.new_pg_temp[p->first].clear();
}
}
}
}

void OSDMonitor::remove_down_pg_temp()
{
dout(10) << "remove_down_pg_temp" << dendl;
OSDMap tmpmap(osdmap);
tmpmap.apply_incremental(pending_inc);

for (map<pg_t,vector<int> >::iterator p = tmpmap.pg_temp->begin();
p != tmpmap.pg_temp->end();
++p) {
unsigned num_up = 0;
for (vector<int>::iterator i = p->second.begin();
i != p->second.end();
++i) {
if (!tmpmap.is_down(*i))
++num_up;
}
if (num_up == 0)
pending_inc.new_pg_temp[p->first].clear();
}
}

/* Assign a lower weight to overloaded OSDs.
*
* The osds that will get a lower weight are those with with a utilization
Expand Down Expand Up @@ -537,10 +502,10 @@ void OSDMonitor::create_pending()
dout(10) << "create_pending e " << pending_inc.epoch << dendl;

// drop any redundant pg_temp entries
remove_redundant_pg_temp();
OSDMap::remove_redundant_temporaries(g_ceph_context, osdmap, &pending_inc);

// drop any pg_temp entries with no up entries
remove_down_pg_temp();
// drop any pg or primary_temp entries with no up entries
OSDMap::remove_down_temps(g_ceph_context, osdmap, &pending_inc);
}

/**
Expand Down Expand Up @@ -2239,7 +2204,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
vector<int> up, acting;
osdmap.pg_to_up_acting_osds(mpgid, up, acting);
int up_p, acting_p;
osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);

string fullobjname;
if (!namespacestr.empty())
Expand All @@ -2255,15 +2221,18 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
f->dump_stream("raw_pgid") << pgid;
f->dump_stream("pgid") << mpgid;
f->dump_stream("up") << up;
f->dump_int("up_primary", up_p);
f->dump_stream("acting") << acting;
f->dump_int("acting_primary", acting_p);
f->close_section(); // osd_map
f->flush(rdata);
} else {
ds << "osdmap e" << osdmap.get_epoch()
<< " pool '" << poolstr << "' (" << pool << ")"
<< " object '" << fullobjname << "' ->"
<< " pg " << pgid << " (" << mpgid << ")"
<< " -> up " << up << " acting " << acting;
<< " -> up (" << up << ", p" << up_p << ") acting ("
<< acting << ", p" << acting_p << ")";
rdata.append(ds);
}
} else if ((prefix == "osd scrub" ||
Expand Down Expand Up @@ -4646,6 +4615,15 @@ int OSDMonitor::_prepare_remove_pool(uint64_t pool)
pending_inc.new_pg_temp[p->first].clear();
}
}
for (map<pg_t,int>::iterator p = osdmap.primary_temp->begin();
p != osdmap.primary_temp->end();
++p) {
if (p->first.pool() == pool) {
dout(10) << "_prepare_remove_pool " << pool
<< " removing obsolete primary_temp" << p->first << dendl;
pending_inc.new_primary_temp[p->first] = -1;
}
}
return 0;
}

Expand Down
2 changes: 0 additions & 2 deletions src/mon/OSDMonitor.h
Expand Up @@ -204,8 +204,6 @@ class OSDMonitor : public PaxosService {
void send_incremental(PaxosServiceMessage *m, epoch_t first);
void send_incremental(epoch_t first, entity_inst_t& dest, bool onetime);

void remove_redundant_pg_temp();
void remove_down_pg_temp();
int reweight_by_utilization(int oload, std::string& out_str);

bool check_source(PaxosServiceMessage *m, uuid_d fsid);
Expand Down
25 changes: 10 additions & 15 deletions src/osd/OSD.cc
Expand Up @@ -1713,7 +1713,7 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
dout(10) << "Adding newly split pg " << *pg << dendl;
vector<int> up, acting;
pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid, up, acting);
int role = pg->get_osdmap()->calc_pg_role(service.whoami, acting);
int role = OSDMap::calc_pg_role(service.whoami, acting);
pg->set_role(role);
pg->reg_next_scrub();
pg->handle_loaded(rctx);
Expand Down Expand Up @@ -1967,7 +1967,7 @@ void OSD::load_pgs()

// generate state for PG's current mapping
pg->get_osdmap()->pg_to_up_acting_osds(pgid, pg->up, pg->acting);
int role = pg->get_osdmap()->calc_pg_role(whoami, pg->acting);
int role = OSDMap::calc_pg_role(whoami, pg->acting);
pg->set_role(role);

PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
Expand Down Expand Up @@ -2146,8 +2146,6 @@ void OSD::handle_pg_peering_evt(

bool create = false;
if (primary) {
assert(role == 0); // otherwise, probably bug in project_pg_history.

// DNE on source?
if (info.dne()) {
// is there a creation pending on this pg?
Expand All @@ -2165,8 +2163,7 @@ void OSD::handle_pg_peering_evt(
}
creating_pgs.erase(info.pgid);
} else {
assert(role != 0); // i should be replica
assert(!info.dne()); // and pg exists if we are hearing about it
assert(!info.dne()); // pg exists if we are hearing about it
}

// do we need to resurrect a deleting pg?
Expand Down Expand Up @@ -5469,9 +5466,9 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin)

// am i still primary?
vector<int> acting;
int nrep = osdmap->pg_to_acting_osds(pgid, acting);
int role = osdmap->calc_pg_role(whoami, acting, nrep);
if (role != 0) {
int primary;
osdmap->pg_to_acting_osds(pgid, &acting, &primary);
if (primary != whoami) {
dout(10) << " no longer primary for " << pgid << ", stopping creation" << dendl;
creating_pgs.erase(p);
} else {
Expand All @@ -5488,7 +5485,6 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin)
while (p != waiting_for_pg.end()) {
pg_t pgid = p->first;

// am i still primary?
vector<int> acting;
int nrep = osdmap->pg_to_acting_osds(pgid, acting);
int role = osdmap->calc_pg_role(whoami, acting, nrep);
Expand Down Expand Up @@ -5951,10 +5947,11 @@ void OSD::handle_pg_create(OpRequestRef op)

// is it still ours?
vector<int> up, acting;
osdmap->pg_to_up_acting_osds(on, up, acting);
int up_primary, acting_primary;
osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
int role = osdmap->calc_pg_role(whoami, acting, acting.size());

if (role != 0) {
if (up_primary != whoami) {
dout(10) << "mkpg " << pgid << " not primary (role=" << role << "), skipping" << dendl;
continue;
}
Expand Down Expand Up @@ -6601,7 +6598,6 @@ void OSD::handle_pg_query(OpRequestRef op)
// get active crush mapping
vector<int> up, acting;
osdmap->pg_to_up_acting_osds(pgid, up, acting);
int role = osdmap->calc_pg_role(whoami, acting, acting.size());

// same primary?
pg_history_t history = it->second.history;
Expand All @@ -6616,7 +6612,6 @@ void OSD::handle_pg_query(OpRequestRef op)
continue;
}

assert(role != 0);
dout(10) << " pg " << pgid << " dne" << dendl;
pg_info_t empty(pgid);
if (it->second.type == pg_query_t::LOG ||
Expand Down Expand Up @@ -6751,7 +6746,7 @@ void OSD::check_replay_queue()
dout(10) << "check_replay_queue " << *pg << dendl;
if (pg->is_active() &&
pg->is_replay() &&
pg->get_role() == 0 &&
pg->is_primary() &&
pg->replay_until == p->second) {
pg->replay_queued_ops();
}
Expand Down

0 comments on commit 3b696f2

Please sign in to comment.