Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mds: support export pinning on directories #14598

Merged
merged 38 commits into from
May 11, 2017
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c3e0aba
mds: order MDBalancer header
batrick Jan 12, 2017
a10cd1f
mds: remove unimplemented function
batrick Apr 11, 2017
36f3bc0
mds: organize Migrator headers
batrick Apr 11, 2017
2d2b387
mds: simplify loops to range-for
batrick Apr 11, 2017
9285b02
mds: add comment for subtrees MDCache member
batrick Apr 11, 2017
4bc458c
messages: add missing header to MMDSLoadTargets
batrick Apr 17, 2017
615f342
mds: organize headers
batrick Apr 17, 2017
efb11c6
mds: use uint64_t for sum of exported inodes
batrick Apr 17, 2017
5f49379
common/DecayCounter: remove redundant qualifiers
batrick Apr 18, 2017
b2f53b8
common/DecayCounter: add delta to value for ::get
batrick Apr 18, 2017
26a08f3
mds: check projected parent to avoid unsafe access
batrick Apr 18, 2017
3cfd46f
mds: dispatch export request
batrick Apr 18, 2017
89e990b
common: assoc. DecayRate with DecayCounter
batrick Apr 17, 2017
082e86c
mds: unify export_targets handling for all exports
batrick Apr 18, 2017
0b420be
mds: add export_pin feature
batrick Apr 11, 2017
df340e8
mds: prevent export of pinned inodes
batrick Apr 19, 2017
aebc1ca
mds: use projected parent to avoid unsafe access
batrick Apr 21, 2017
91952aa
mds: break ancestor walk if node is export_pinned
batrick Apr 25, 2017
0d6d320
mds: create auxiliary subtree inside auth subtree
ukernel Apr 27, 2017
649b38e
mds: simplify auxiliary subtrees when sending subtree resolve
ukernel Apr 28, 2017
7a76ea5
Revert "mds: prevent export of pinned inodes"
batrick Apr 28, 2017
6bd58fe
mds: use aux subtrees for export pinned inodes
batrick May 2, 2017
9ac8e2a
mds: handle imported directory that is pinned
batrick May 2, 2017
f1508ca
mds: use clearer name for export pin cont
batrick May 2, 2017
9755119
mds: cleanup aux subtree pin debug
batrick May 2, 2017
63cbe33
qa: remove errant mount requirement
batrick May 2, 2017
4cd4782
mds: check export_pin on dirfrag load
batrick May 3, 2017
41a44ae
mds: remove unnecessary check for parent pins
batrick May 4, 2017
3880c6d
mds: do not try to export pin special directories
batrick May 4, 2017
9552efd
qa: improve time handling for test_exports test
batrick May 5, 2017
383d521
mds: handle export pin on unjournaled directory
batrick May 5, 2017
048abc3
mds: call maybe_export_pin on all fetched dirfrags
batrick May 5, 2017
26dd9e8
mds: handle aux subtree when splitting/merging dirfrag
ukernel May 5, 2017
0c9c294
mds: check mdsdir against dest
batrick May 6, 2017
88e6db1
mds: update export targets even when not active
batrick May 9, 2017
6bdd9e7
mds: properly cleanup export states
ukernel May 9, 2017
68adc96
mds: fix dir auth calculation in CDir::merge
ukernel May 9, 2017
b6ba30b
mds: delay export until dir is stable
batrick May 10, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions src/mds/MDBalancer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -337,14 +337,10 @@ void MDBalancer::export_empties()
{
dout(5) << "export_empties checking for empty imports" << dendl;

for (map<CDir*,set<CDir*> >::iterator it = mds->mdcache->subtrees.begin();
it != mds->mdcache->subtrees.end();
++it) {
CDir *dir = it->first;
if (!dir->is_auth() ||
dir->is_ambiguous_auth() ||
dir->is_freezing() ||
dir->is_frozen())
std::set<CDir *> subtrees;
mds->mdcache->get_fullauth_subtrees(subtrees);
for (auto &dir : subtrees) {
if (dir->is_freezing() || dir->is_frozen())
continue;

if (!dir->inode->is_base() &&
Expand Down
146 changes: 76 additions & 70 deletions src/mds/MDBalancer.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ using std::map;
#include "include/types.h"
#include "common/Clock.h"
#include "common/Cond.h"
#include "CInode.h"


class MDSRank;
class Message;
Expand All @@ -37,49 +35,7 @@ class Messenger;
class MonClient;

class MDBalancer {
protected:
MDSRank *mds;
Messenger *messenger;
MonClient *mon_client;
int beat_epoch;

int last_epoch_under;
int last_epoch_over;
string bal_code;
string bal_version;

utime_t last_heartbeat;
utime_t last_sample;
utime_t rebalance_time; //ensure a consistent view of load for rebalance

// Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
// just as soon as a delayed context comes back and triggers it.
// These sets just prevent us from spawning extra timer contexts for
// dirfrags that already have one in flight.
set<dirfrag_t> split_pending, merge_pending;

// per-epoch scatter/gathered info
map<mds_rank_t, mds_load_t> mds_load;
map<mds_rank_t, double> mds_meta_load;
map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;

// per-epoch state
double my_load, target_load;
map<mds_rank_t,double> my_targets;
map<mds_rank_t,double> imported;
map<mds_rank_t,double> exported;

map<mds_rank_t, int> old_prev_targets; // # iterations they _haven't_ been targets
bool check_targets();

double try_match(mds_rank_t ex, double& maxex,
mds_rank_t im, double& maxim);
double get_maxim(mds_rank_t im) {
return target_load - mds_meta_load[im] - imported[im];
}
double get_maxex(mds_rank_t ex) {
return mds_meta_load[ex] - target_load - exported[ex];
}
friend class C_Bal_SendHeartbeat;

public:
MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
Expand All @@ -88,39 +44,32 @@ class MDBalancer {
mon_client(monc),
beat_epoch(0),
last_epoch_under(0), last_epoch_over(0), my_load(0.0), target_load(0.0) { }

mds_load_t get_load(utime_t);

int proc_message(Message *m);

int localize_balancer();
void send_heartbeat();
void handle_heartbeat(MHeartbeat *m);

/**
* Regularly called upkeep function.
*
* Sends MHeartbeat messages to the mons.
*/
void tick();

void export_empties();
//set up the rebalancing targets for export and do one if the
//MDSMap is up to date
void prep_rebalance(int beat);
int mantle_prep_rebalance();
/*check if the monitor has recorded the current export targets;
if it has then do the actual export. Otherwise send off our
export targets message again*/
/**
* Try to rebalance after receiving monitor mdsmap update.
*
* Check if the monitor has recorded the current export targets;
* if it has then do the actual export. Otherwise send off our
* export targets message again.
*/
void try_rebalance();
void find_exports(CDir *dir,
double amount,
list<CDir*>& exports,
double& have,
set<CDir*>& already_exporting);

void subtract_export(CDir *ex, utime_t now);
void add_import(CDir *im, utime_t now);

void subtract_export(class CDir *ex, utime_t now);
void add_import(class CDir *im, utime_t now);

void hit_inode(utime_t now, class CInode *in, int type, int who=-1);
void hit_dir(utime_t now, class CDir *dir, int type, int who=-1, double amount=1.0);
void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj);
void hit_inode(utime_t now, CInode *in, int type, int who=-1);
void hit_dir(utime_t now, CDir *dir, int type, int who=-1, double amount=1.0);

void queue_split(const CDir *dir, bool fast);
void queue_merge(CDir *dir);
Expand All @@ -132,8 +81,65 @@ class MDBalancer {
* \param hot whether the directory's temperature is enough to split it
*/
void maybe_fragment(CDir *dir, bool hot);
};

private:
//set up the rebalancing targets for export and do one if the
//MDSMap is up to date
void prep_rebalance(int beat);
int mantle_prep_rebalance();

void export_empties();
int localize_balancer();
bool check_targets();
void send_heartbeat();
void handle_heartbeat(MHeartbeat *m);
void find_exports(CDir *dir,
double amount,
list<CDir*>& exports,
double& have,
set<CDir*>& already_exporting);

double try_match(mds_rank_t ex, double& maxex,
mds_rank_t im, double& maxim);
double get_maxim(mds_rank_t im) {
return target_load - mds_meta_load[im] - imported[im];
}
double get_maxex(mds_rank_t ex) {
return mds_meta_load[ex] - target_load - exported[ex];
}

MDSRank *mds;
Messenger *messenger;
MonClient *mon_client;
int beat_epoch;

int last_epoch_under;
int last_epoch_over;
string bal_code;
string bal_version;

utime_t last_heartbeat;
utime_t last_sample;
utime_t rebalance_time; //ensure a consistent view of load for rebalance

// Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
// just as soon as a delayed context comes back and triggers it.
// These sets just prevent us from spawning extra timer contexts for
// dirfrags that already have one in flight.
set<dirfrag_t> split_pending, merge_pending;

// per-epoch scatter/gathered info
map<mds_rank_t, mds_load_t> mds_load;
map<mds_rank_t, double> mds_meta_load;
map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;

// per-epoch state
double my_load, target_load;
map<mds_rank_t,double> my_targets;
map<mds_rank_t,double> imported;
map<mds_rank_t,double> exported;

map<mds_rank_t, int> old_prev_targets; // # iterations they _haven't_ been targets
};

#endif
22 changes: 8 additions & 14 deletions src/mds/MDCache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1257,19 +1257,15 @@ void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
if (bounds != subtrees[dir]) {
dout(0) << "verify_subtree_bounds failed" << dendl;
set<CDir*> b = bounds;
for (set<CDir*>::iterator p = subtrees[dir].begin();
p != subtrees[dir].end();
++p) {
if (bounds.count(*p)) {
b.erase(*p);
for (auto &cd : subtrees[dir]) {
if (bounds.count(cd)) {
b.erase(cd);
continue;
}
dout(0) << " missing bound " << **p << dendl;
dout(0) << " missing bound " << *cd << dendl;
}
for (set<CDir*>::iterator p = b.begin();
p != b.end();
++p)
dout(0) << " extra bound " << **p << dendl;
for (const auto &cd : b)
dout(0) << " extra bound " << *cd << dendl;
}
assert(bounds == subtrees[dir]);
}
Expand All @@ -1281,10 +1277,8 @@ void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)

// make sure that any bounds i do have are properly noted as such.
int failed = 0;
for (list<dirfrag_t>::const_iterator p = bounds.begin();
p != bounds.end();
++p) {
CDir *bd = get_dirfrag(*p);
for (const auto &fg : bounds) {
CDir *bd = get_dirfrag(fg);
if (!bd) continue;
if (subtrees[dir].count(bd) == 0) {
dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
Expand Down
3 changes: 2 additions & 1 deletion src/mds/MDCache.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,8 @@ class MDCache {

// -- subtrees --
protected:
map<CDir*,set<CDir*> > subtrees; // nested bounds on subtrees.
/* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
map<CDir*,set<CDir*> > subtrees;
map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir

// adjust subtree auth specification
Expand Down
2 changes: 1 addition & 1 deletion src/mds/MDSRank.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "messages/MClientRequestForward.h"
#include "messages/MMDSMap.h"
#include "messages/MMDSTableRequest.h"
#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

Expand All @@ -25,7 +26,6 @@
#include "SnapClient.h"
#include "SnapServer.h"
#include "MDBalancer.h"
#include "messages/MMDSTableRequest.h"
#include "Locker.h"
#include "Server.h"
#include "InoTable.h"
Expand Down
11 changes: 4 additions & 7 deletions src/mds/Migrator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1269,7 +1269,7 @@ void Migrator::export_go_synced(CDir *dir, uint64_t tid)
// fill export message with cache data
MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
map<client_t,entity_inst_t> exported_client_map;
int num_exported_inodes = encode_export_dir(req->export_data,
uint64_t num_exported_inodes = encode_export_dir(req->export_data,
dir, // recur start point
exported_client_map,
now);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks to the dirfrag size and MExportDir message size limits, it's unlikely to overflow.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay to leave it as uint64_t?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

Expand Down Expand Up @@ -1430,12 +1430,12 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,

}

int Migrator::encode_export_dir(bufferlist& exportbl,
uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
CDir *dir,
map<client_t,entity_inst_t>& exported_client_map,
utime_t now)
{
int num_exported = 0;
uint64_t num_exported = 0;

dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;

Expand Down Expand Up @@ -1700,10 +1700,7 @@ void Migrator::export_reverse(CDir *dir)
}

// unpin bounds
for (set<CDir*>::iterator p = bounds.begin();
p != bounds.end();
++p) {
CDir *bd = *p;
for (const auto &bd : bounds) {
bd->put(CDir::PIN_EXPORTBOUND);
bd->state_clear(CDir::STATE_EXPORTBOUND);
}
Expand Down
Loading