Skip to content

Commit

Permalink
mon: prime pg_temp
Browse files Browse the repository at this point in the history
Prime pg_temp values for

 - any osd that goes up/down or has a reweight change
 - all osds on crush map change

We're ignoring primary_affinity and primary_temp at the moment.

No attempt is made (yet) to limit the time or CPU we burn doing this.

Signed-off-by: Sage Weil <sage@redhat.com>
(cherry picked from commit 7a1305b)
  • Loading branch information
liewegas authored and Robert LeBlanc committed Feb 29, 2016
1 parent eda25a1 commit 6ae2ae7
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/common/config_opts.h
Expand Up @@ -196,6 +196,7 @@ OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get conce
OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap
OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap
OPTION(mon_osd_prime_pg_temp, OPT_BOOL, false) // when encoding a pending osdmap, pre-set pg_temp (to the current acting set) for PGs whose mapping the pending change would alter
OPTION(mon_stat_smooth_intervals, OPT_INT, 2) // smooth stats over last N PGMap maps
OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
OPTION(mon_lease_renew_interval, OPT_FLOAT, 3) // on leader, to renew the lease
Expand Down
90 changes: 90 additions & 0 deletions src/mon/OSDMonitor.cc
Expand Up @@ -887,6 +887,93 @@ void OSDMonitor::create_pending()
OSDMap::remove_down_temps(g_ceph_context, osdmap, &pending_inc);
}

void OSDMonitor::maybe_prime_pg_temp()
{
if (pending_inc.crush.length()) {
dout(10) << __func__ << " new crush map" << dendl;
OSDMap next;
next.deepish_copy_from(osdmap);
next.apply_incremental(pending_inc);
prime_pg_temp(next, &mon->pgmon()->pg_map);
return;
}

// check for interesting OSDs
set<int> osds;
for (map<int32_t,uint8_t>::iterator p = pending_inc.new_state.begin();
p != pending_inc.new_state.end();
++p) {
if (p->second & CEPH_OSD_UP) {
osds.insert(p->first);
}
}
for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
p != pending_inc.new_weight.end();
++p) {
osds.insert(p->first);
}
if (!osds.empty()) {
dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
OSDMap next;
next.deepish_copy_from(osdmap);
next.apply_incremental(pending_inc);
for (set<int>::iterator p = osds.begin(); p != osds.end(); ++p) {
prime_pg_temp(next, &mon->pgmon()->pg_map, *p);
}
}
}

/**
 * Prime pg_temp for a single PG.
 *
 * If the PG's acting set under @p next differs both from the acting set in
 * its last reported pg_stat and from the acting set under the current
 * osdmap, record the current acting set in pending_inc.new_pg_temp.
 *
 * Nothing is recorded when:
 *  - pending_inc already has a new_pg_temp entry for this PG;
 *  - the acting set under @p next equals the one in the PG's pg_stat;
 *  - the acting set under @p next equals the one under the current osdmap;
 *  - the current acting set is empty.
 *
 * @param next  the osdmap as it will look with pending_inc applied
 * @param pp    iterator to the PG's (pg_t, pg_stat_t) entry; must be
 *              dereferenceable
 */
void OSDMonitor::prime_pg_temp(OSDMap& next,
                               ceph::unordered_map<pg_t, pg_stat_t>::iterator pp)
{
  const pg_t& pgid = pp->first;

  // do not touch a mapping if a change is pending
  if (pending_inc.new_pg_temp.count(pgid))
    return;

  // up_primary/acting_primary are required out-params of the lookup; their
  // values are not used here.
  vector<int> up, acting;
  int up_primary, acting_primary;
  next.pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &acting_primary);
  if (acting == pp->second.acting)
    return;  // no change since last pg update, skip

  vector<int> cur_up, cur_acting;
  osdmap.pg_to_up_acting_osds(pgid, &cur_up, &up_primary,
                              &cur_acting, &acting_primary);
  if (cur_acting == acting)
    return;  // no change this epoch; must be stale pg_stat

  // An empty acting set cannot pin the PG to anything, so there is nothing
  // to prime.
  // NOTE(review): an empty new_pg_temp entry is understood to mean "remove
  // pg_temp" when the incremental is applied -- confirm against
  // OSDMap::apply_incremental.
  if (cur_acting.empty())
    return;

  dout(20) << __func__ << " " << pgid << " " << cur_up << "/" << cur_acting
           << " -> " << up << "/" << acting
           << ", priming " << cur_acting
           << dendl;
  pending_inc.new_pg_temp[pgid] = cur_acting;
}

/**
 * Prime pg_temp for every PG that @p pg_map lists under @p osd.
 *
 * PGs that appear in pg_by_osd but have no pg_stat entry are skipped.  An
 * OSD with no pg_by_osd entry results in no work beyond the log line.
 *
 * @param next    the osdmap as it will look with pending_inc applied
 * @param pg_map  PG map supplying pg_by_osd and pg_stat
 * @param osd     id of the OSD whose PGs should be examined
 */
void OSDMonitor::prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd)
{
  dout(10) << __func__ << " osd." << osd << dendl;

  typedef ceph::unordered_map<int, set<pg_t> >::iterator by_osd_iter;
  typedef ceph::unordered_map<pg_t, pg_stat_t>::iterator stat_iter;

  by_osd_iter by_osd = pg_map->pg_by_osd.find(osd);
  if (by_osd == pg_map->pg_by_osd.end())
    return;

  set<pg_t>& pgs = by_osd->second;
  for (set<pg_t>::iterator pgid = pgs.begin(); pgid != pgs.end(); ++pgid) {
    stat_iter stat = pg_map->pg_stat.find(*pgid);
    if (stat != pg_map->pg_stat.end())
      prime_pg_temp(next, stat);
  }
}

/**
 * Prime pg_temp for every PG that has an entry in @p pg_map's pg_stat.
 *
 * @param next    the osdmap as it will look with pending_inc applied
 * @param pg_map  PG map whose pg_stat table is walked in full
 */
void OSDMonitor::prime_pg_temp(OSDMap& next, PGMap *pg_map)
{
  dout(10) << __func__ << dendl;

  typedef ceph::unordered_map<pg_t, pg_stat_t>::iterator stat_iter;

  stat_iter cur = pg_map->pg_stat.begin();
  const stat_iter last = pg_map->pg_stat.end();
  while (cur != last) {
    prime_pg_temp(next, cur);
    ++cur;
  }
}


/**
* @note receiving a transaction in this function gives a fair amount of
* freedom to the service implementation if it does need it. It shouldn't.
Expand All @@ -902,6 +989,9 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
assert(r == 0);

if (g_conf->mon_osd_prime_pg_temp)
maybe_prime_pg_temp();

bufferlist bl;

// tell me about it
Expand Down
8 changes: 8 additions & 0 deletions src/mon/OSDMonitor.h
Expand Up @@ -35,6 +35,8 @@ using namespace std;
#include "Session.h"

class Monitor;
class PGMap;

#include "messages/MOSDBoot.h"
#include "messages/MMonCommand.h"
#include "messages/MOSDMap.h"
Expand Down Expand Up @@ -196,6 +198,12 @@ class OSDMonitor : public PaxosService {

void share_map_with_random_osd();

// Prime pg_temp in pending_inc if the pending change carries a new crush
// map, flips an OSD's up state, or changes an OSD's weight.
void maybe_prime_pg_temp();
// Prime pg_temp for the single PG referenced by pp: if its acting set under
// 'next' differs from both its pg_stat and the current osdmap, record the
// current acting set in pending_inc.new_pg_temp.
void prime_pg_temp(OSDMap& next,
ceph::unordered_map<pg_t, pg_stat_t>::iterator pp);
// Prime pg_temp for every PG that pg_map lists under the given osd.
void prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd);
// Prime pg_temp for every PG with a pg_stat entry in pg_map.
void prime_pg_temp(OSDMap& next, PGMap *pg_map);

void update_logger();

void handle_query(PaxosServiceMessage *m);
Expand Down
1 change: 1 addition & 0 deletions src/vstart.sh
Expand Up @@ -420,6 +420,7 @@ $extra_conf
mon pg warn min per osd = 3
mon osd allow primary affinity = true
mon reweight min pgs per osd = 4
mon osd prime pg temp = true
$DAEMONOPTS
$CMONDEBUG
$extra_conf
Expand Down

0 comments on commit 6ae2ae7

Please sign in to comment.