Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osd: bluestore: fix space rebalancing, collection split, buffered reads #7196

Merged
merged 11 commits into from
Jan 14, 2016
11 changes: 6 additions & 5 deletions src/common/config_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -849,11 +849,11 @@ OPTION(bluefs_min_flush_size, OPT_U64, 65536) // ignore flush until its this bi

OPTION(bluestore_bluefs, OPT_BOOL, true)
OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug
OPTION(bluestore_bluefs_initial_length, OPT_U64, 65536*1024)
OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .01)
OPTION(bluestore_bluefs_min_free_ratio, OPT_FLOAT, .1)
OPTION(bluestore_bluefs_max_free_fs_main_ratio, OPT_FLOAT, .8)
OPTION(bluestore_bluefs_min_gift_ratio, OPT_FLOAT, 1)
OPTION(bluestore_bluefs_min, OPT_U64, 1*1024*1024*1024) // 1gb
OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .02) // min fs free / total free
OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT, .90) // max fs free / total free
OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT, .02) // how much to add at a time
OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT, .20) // how much to reclaim at a time
OPTION(bluestore_block_path, OPT_STR, "")
OPTION(bluestore_block_size, OPT_U64, 10 * 1024*1024*1024) // 10gb for testing
OPTION(bluestore_block_db_path, OPT_STR, "")
Expand Down Expand Up @@ -887,6 +887,7 @@ OPTION(bluestore_overlay_max, OPT_INT, 0)
OPTION(bluestore_open_by_handle, OPT_BOOL, true)
OPTION(bluestore_o_direct, OPT_BOOL, true)
OPTION(bluestore_clone_cow, OPT_BOOL, true) // do copy-on-write for clones
OPTION(bluestore_default_buffered_read, OPT_BOOL, false)
OPTION(bluestore_debug_misc, OPT_BOOL, false)
OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL, false)
OPTION(bluestore_debug_small_allocations, OPT_INT, 0)
Expand Down
9 changes: 9 additions & 0 deletions src/os/bluestore/BlockDevice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ int BlockDevice::open(string p)
assert(0 == "non-aio not supported");
}

// disable readahead as it will wreak havoc on our mix of
// directio/aio and buffered io.
//
// NOTE: unlike most POSIX calls, posix_fadvise() reports failure by
// returning the error number itself (a positive value) and does not
// set errno, so check for nonzero and negate the return value.
r = posix_fadvise(fd_buffered, 0, 0, POSIX_FADV_RANDOM);
if (r) {
  r = -r;
  derr << __func__ << " posix_fadvise got: " << cpp_strerror(r) << dendl;
  goto out_fail;
}

r = _lock();
if (r < 0) {
derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
Expand Down
24 changes: 24 additions & 0 deletions src/os/bluestore/BlueFS.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,30 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
dout(10) << __func__ << " done" << dendl;
}

int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
uint64_t *offset, uint32_t *length)
{
dout(1) << __func__ << " bdev " << id << " want " << want << dendl;
assert(id < alloc.size());
int r = alloc[id]->reserve(want);
assert(r == 0); // caller shouldn't ask for more than they can get

r = alloc[id]->allocate(want, g_conf->bluefs_alloc_size, 0,
offset, length);
assert(r >= 0);
if (*length < want)
alloc[id]->unreserve(want - *length);

block_all[id].erase(*offset, *length);
log_t.op_alloc_rm(id, *offset, *length);
r = _flush_log();
assert(r == 0);

dout(1) << __func__ << " bdev " << id << " want " << want
<< " got " << *offset << "~" << *length << dendl;
return 0;
}

uint64_t BlueFS::get_total(unsigned id)
{
Mutex::Locker l(lock);
Expand Down
4 changes: 4 additions & 0 deletions src/os/bluestore/BlueFS.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ class BlueFS {
/// gift more block space
void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);

/// reclaim block space
int reclaim_blocks(unsigned bdev, uint64_t want,
uint64_t *offset, uint32_t *length);

void flush(FileWriter *h) {
Mutex::Locker l(lock);
_flush(h, false);
Expand Down
162 changes: 101 additions & 61 deletions src/os/bluestore/BlueStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -981,7 +981,7 @@ int BlueStore::_open_alloc()

alloc = Allocator::create("stupid");
uint64_t num = 0, bytes = 0;
const map<uint64_t,uint64_t>& fl = fm->get_freelist();
const auto& fl = fm->get_freelist();
for (auto& p : fl) {
alloc->init_add_free(p.first, p.second);
++num;
Expand Down Expand Up @@ -1181,8 +1181,15 @@ int BlueStore::_open_db(bool create)
if (create) {
// note: we might waste a 4k block here if block.db is used, but it's
// simpler.
bluefs->add_block_extent(id, BLUEFS_START,
g_conf->bluestore_bluefs_initial_length);
uint64_t initial =
bdev->get_size() * (g_conf->bluestore_bluefs_min_ratio +
g_conf->bluestore_bluefs_gift_ratio);
initial = MAX(initial, g_conf->bluestore_bluefs_min);
// align to bluefs's alloc_size
initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size);
initial += g_conf->bluefs_alloc_size - BLUEFS_START;
bluefs->add_block_extent(id, BLUEFS_START, initial);
bluefs_extents.insert(BLUEFS_START, initial);
}
bluefs_shared_bdev = id;
++id;
Expand Down Expand Up @@ -1362,7 +1369,8 @@ int BlueStore::_reconcile_bluefs_freespace()
return 0;
}

int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents)
int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents,
KeyValueDB::Transaction t)
{
int ret = 0;
assert(bluefs);
Expand All @@ -1380,73 +1388,97 @@ int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents)
uint64_t total = bdev->get_size();
float my_free_ratio = (float)my_free / (float)total;

dout(10) << __func__ << " bluefs " << pretty_si_t(bluefs_free)
<< " free of " << pretty_si_t(bluefs_total)
<< " free_ratio " << bluefs_free_ratio << dendl;
dout(10) << __func__ << " bluestore " << pretty_si_t(my_free)
<< " free of " << pretty_si_t(total)
<< " free_ratio " << my_free_ratio << dendl;
uint64_t total_free = bluefs_free + my_free;

float bluefs_ratio = (float)bluefs_free / (float)total_free;

dout(10) << __func__
<< " bluefs " << pretty_si_t(bluefs_free)
<< " free (" << bluefs_free_ratio
<< ") bluestore " << pretty_si_t(my_free)
<< " free (" << my_free_ratio
<< "), bluefs_ratio " << bluefs_ratio
<< dendl;

uint64_t gift = 0;
if (bluefs_free_ratio < g_conf->bluestore_bluefs_min_free_ratio &&
bluefs_free_ratio < my_free_ratio) {
// give it more
gift = g_conf->bluestore_bluefs_min_free_ratio * bluefs_total;
dout(10) << __func__ << " bluefs_free_ratio " << bluefs_free_ratio
<< " < min_free_ratio " << g_conf->bluestore_bluefs_min_free_ratio
<< ", should gift " << pretty_si_t(gift) << dendl;
}
float bluefs_ratio = (float)bluefs_total / (float)total;
uint64_t reclaim = 0;
if (bluefs_ratio < g_conf->bluestore_bluefs_min_ratio) {
uint64_t g = total * g_conf->bluestore_bluefs_min_ratio;
gift = g_conf->bluestore_bluefs_gift_ratio * total_free;
dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
<< " < min_ratio " << g_conf->bluestore_bluefs_min_ratio
<< ", should gift " << pretty_si_t(gift) << dendl;
} else if (bluefs_ratio > g_conf->bluestore_bluefs_max_ratio) {
reclaim = g_conf->bluestore_bluefs_reclaim_ratio * total_free;
if (bluefs_total - reclaim < g_conf->bluestore_bluefs_min)
reclaim = bluefs_total - g_conf->bluestore_bluefs_min;
dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
<< " > max_ratio " << g_conf->bluestore_bluefs_max_ratio
<< ", should reclaim " << pretty_si_t(reclaim) << dendl;
}
if (bluefs_total < g_conf->bluestore_bluefs_min) {
uint64_t g = g_conf->bluestore_bluefs_min;
dout(10) << __func__ << " bluefs_total " << bluefs_total
<< " < min " << g_conf->bluestore_bluefs_min
<< ", should gift " << pretty_si_t(g) << dendl;
if (g > gift)
gift = g;
reclaim = 0;
}

float fs_main_ratio = (float)bluefs_free / (float)my_free;
dout(10) << __func__ << " fs:main free ratio " << fs_main_ratio << dendl;

if (gift) {
float gift_ratio = (float)gift / (float)bluefs_free;
if (gift_ratio < g_conf->bluestore_bluefs_min_gift_ratio) {
dout(10) << __func__ << " proposed gift of " << pretty_si_t(gift)
<< " gift_ratio " << gift_ratio
<< " < min_gift_ratio " << g_conf->bluestore_bluefs_min_gift_ratio
<< dendl;
} else {
// round up to alloc size
uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size;
gift = ROUND_UP_TO(gift, min_alloc_size);
// round up to alloc size
uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size;
gift = ROUND_UP_TO(gift, min_alloc_size);

// hard cap to fit into 32 bits
gift = MIN(gift, 1ull<<31);
dout(10) << __func__ << " gifting " << gift
// hard cap to fit into 32 bits
gift = MIN(gift, 1ull<<31);
dout(10) << __func__ << " gifting " << gift
<< " (" << pretty_si_t(gift) << ")" << dendl;

// fixme: just do one allocation to start...
int r = alloc->reserve(gift);
assert(r == 0);

bluestore_extent_t e;
r = alloc->allocate(gift, min_alloc_size, 0, &e.offset, &e.length);
if (r < 0) {
assert(0 == "allocate failed, wtf");
return r;
}
if (e.length < gift) {
alloc->unreserve(gift - e.length);
}
// fixme: just do one allocation to start...
int r = alloc->reserve(gift);
assert(r == 0);

dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
extents->push_back(e);
ret = 1;
bluestore_extent_t e;
r = alloc->allocate(gift, min_alloc_size, 0, &e.offset, &e.length);
if (r < 0) {
assert(0 == "allocate failed, wtf");
return r;
}
if (e.length < gift) {
alloc->unreserve(gift - e.length);
}

dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
extents->push_back(e);
ret = 1;
}

// FIXME: reclaim from bluefs?
// reclaim from bluefs?
if (reclaim) {
// round up to alloc size
uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size;
reclaim = ROUND_UP_TO(reclaim, min_alloc_size);

// hard cap to fit into 32 bits
reclaim = MIN(reclaim, 1ull<<31);
dout(10) << __func__ << " reclaiming " << reclaim
<< " (" << pretty_si_t(reclaim) << ")" << dendl;

uint64_t offset = 0;
uint32_t length = 0;

// NOTE: this will block and do IO.
int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
&offset, &length);
assert(r >= 0);

bluefs_extents.erase(offset, length);

fm->release(offset, length, t);
alloc->release(offset, length);
ret = 1;
}

return ret;
}
Expand Down Expand Up @@ -1593,11 +1625,10 @@ int BlueStore::mkfs()
KeyValueDB::Transaction t = db->get_transaction();
uint64_t reserved = 0;
if (g_conf->bluestore_bluefs) {
reserved = BLUEFS_START + g_conf->bluestore_bluefs_initial_length;
dout(20) << __func__ << " reserved first " << reserved
<< " bytes for bluefs" << dendl;
bluefs_extents.insert(BLUEFS_START,
g_conf->bluestore_bluefs_initial_length);
assert(bluefs_extents.num_intervals() == 1);
interval_set<uint64_t>::iterator p = bluefs_extents.begin();
reserved = p.get_start() + p.get_len();
dout(20) << __func__ << " reserved " << reserved << " for bluefs" << dendl;
bufferlist bl;
::encode(bluefs_extents, bl);
t->set(PREFIX_SUPER, "bluefs_extents", bl);
Expand Down Expand Up @@ -2126,8 +2157,8 @@ int BlueStore::fsck()

dout(1) << __func__ << " checking freelist vs allocated" << dendl;
{
const map<uint64_t,uint64_t>& free = fm->get_freelist();
for (map<uint64_t,uint64_t>::const_iterator p = free.begin();
const auto& free = fm->get_freelist();
for (auto p = free.begin();
p != free.end(); ++p) {
if (used_blocks.intersects(p->first, p->second)) {
derr << __func__ << " free extent " << p->first << "~" << p->second
Expand Down Expand Up @@ -2342,6 +2373,11 @@ int BlueStore::_do_read(
if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
dout(20) << __func__ << " will do buffered read" << dendl;
buffered = true;
} else if (g_conf->bluestore_default_buffered_read &&
(op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
dout(20) << __func__ << " defaulting to buffered read" << dendl;
buffered = true;
}

dout(20) << __func__ << " " << offset << "~" << length << " size "
Expand Down Expand Up @@ -3523,7 +3559,7 @@ void BlueStore::_kv_sync_thread()

vector<bluestore_extent_t> bluefs_gift_extents;
if (bluefs) {
int r = _balance_bluefs_freespace(&bluefs_gift_extents);
int r = _balance_bluefs_freespace(&bluefs_gift_extents, t);
assert(r >= 0);
if (r > 0) {
for (auto& p : bluefs_gift_extents) {
Expand Down Expand Up @@ -6128,6 +6164,10 @@ int BlueStore::_split_collection(TransContext *txc,
assert(d->cnode.bits == bits);
r = 0;

bufferlist bl;
::encode(c->cnode, bl);
txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
<< " bits " << bits << " = " << r << dendl;
return r;
Expand Down
3 changes: 2 additions & 1 deletion src/os/bluestore/BlueStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,8 @@ class BlueStore : public ObjectStore {
int _open_super_meta();

int _reconcile_bluefs_freespace();
int _balance_bluefs_freespace(vector<bluestore_extent_t> *extents);
int _balance_bluefs_freespace(vector<bluestore_extent_t> *extents,
KeyValueDB::Transaction t);
void _commit_bluefs_freespace(const vector<bluestore_extent_t>& extents);

CollectionRef _get_collection(coll_t cid);
Expand Down
17 changes: 11 additions & 6 deletions src/os/bluestore/FreelistManager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,14 @@ void FreelistManager::shutdown()
/// Public dump entry point: acquires the freelist lock, then delegates
/// to _dump() to do the actual logging.
void FreelistManager::dump()
{
  Mutex::Locker guard(lock);
  _dump();
}

void FreelistManager::_dump()
{
dout(30) << __func__ << " " << total_free
<< " in " << kv_free.size() << " extents" << dendl;
for (std::map<uint64_t,uint64_t>::iterator p = kv_free.begin();
for (auto p = kv_free.begin();
p != kv_free.end();
++p) {
dout(30) << __func__ << " " << p->first << "~" << p->second << dendl;
Expand Down Expand Up @@ -109,7 +114,7 @@ int FreelistManager::allocate(
Mutex::Locker l(lock);
dout(10) << __func__ << " " << offset << "~" << length << dendl;
total_free -= length;
map<uint64_t,uint64_t>::iterator p = kv_free.lower_bound(offset);
auto p = kv_free.lower_bound(offset);
if ((p == kv_free.end() || p->first > offset) &&
p != kv_free.begin()) {
--p;
Expand All @@ -121,7 +126,7 @@ int FreelistManager::allocate(
if (p != kv_free.end()) {
derr << " existing extent " << p->first << "~" << p->second << dendl;
}
dump();
_dump();
assert(0 == "bad allocate");
}

Expand Down Expand Up @@ -182,7 +187,7 @@ int FreelistManager::release(
Mutex::Locker l(lock);
dout(10) << __func__ << " " << offset << "~" << length << dendl;
total_free += length;
map<uint64_t,uint64_t>::iterator p = kv_free.lower_bound(offset);
auto p = kv_free.lower_bound(offset);

// contiguous with previous extent?
if (p != kv_free.begin()) {
Expand All @@ -199,7 +204,7 @@ int FreelistManager::release(
} else if (p->first + p->second > offset) {
derr << __func__ << " bad release " << offset << "~" << length
<< " overlaps with " << p->first << "~" << p->second << dendl;
dump();
_dump();
assert(0 == "bad release overlap");
} else {
dout(30) << __func__ << " previous extent " << p->first << "~" << p->second
Expand All @@ -221,7 +226,7 @@ int FreelistManager::release(
} else if (p->first < offset + length) {
derr << __func__ << " bad release " << offset << "~" << length
<< " overlaps with " << p->first << "~" << p->second << dendl;
dump();
_dump();
assert(0 == "bad release overlap");
} else {
dout(30) << __func__ << " next extent " << p->first << "~" << p->second
Expand Down