Skip to content

Commit

Permalink
Merge pull request #7196 from liewegas/wip-bluestore
Browse files Browse the repository at this point in the history
osd: bluestore: fix space rebalancing, collection split, buffered reads
  • Loading branch information
liewegas committed Jan 14, 2016
2 parents 1c85998 + e92cbde commit 7241bf3
Show file tree
Hide file tree
Showing 11 changed files with 173 additions and 81 deletions.
11 changes: 6 additions & 5 deletions src/common/config_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -853,11 +853,11 @@ OPTION(bluefs_min_flush_size, OPT_U64, 65536) // ignore flush until its this bi

OPTION(bluestore_bluefs, OPT_BOOL, true)
OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug
OPTION(bluestore_bluefs_initial_length, OPT_U64, 65536*1024)
OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .01)
OPTION(bluestore_bluefs_min_free_ratio, OPT_FLOAT, .1)
OPTION(bluestore_bluefs_max_free_fs_main_ratio, OPT_FLOAT, .8)
OPTION(bluestore_bluefs_min_gift_ratio, OPT_FLOAT, 1)
OPTION(bluestore_bluefs_min, OPT_U64, 1*1024*1024*1024) // 1gb
OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .02) // min fs free / total free
OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT, .90) // max fs free / total free
OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT, .02) // how much to add at a time
OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT, .20) // how much to reclaim at a time
OPTION(bluestore_block_path, OPT_STR, "")
OPTION(bluestore_block_size, OPT_U64, 10 * 1024*1024*1024) // 10gb for testing
OPTION(bluestore_block_db_path, OPT_STR, "")
Expand Down Expand Up @@ -891,6 +891,7 @@ OPTION(bluestore_overlay_max, OPT_INT, 0)
OPTION(bluestore_open_by_handle, OPT_BOOL, true)
OPTION(bluestore_o_direct, OPT_BOOL, true)
OPTION(bluestore_clone_cow, OPT_BOOL, true) // do copy-on-write for clones
OPTION(bluestore_default_buffered_read, OPT_BOOL, false)
OPTION(bluestore_debug_misc, OPT_BOOL, false)
OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL, false)
OPTION(bluestore_debug_small_allocations, OPT_INT, 0)
Expand Down
9 changes: 9 additions & 0 deletions src/os/bluestore/BlockDevice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ int BlockDevice::open(string p)
assert(0 == "non-aio not supported");
}

// disable readahead as it will wreak havoc on our mix of
// directio/aio and buffered io.
r = posix_fadvise(fd_buffered, 0, 0, POSIX_FADV_RANDOM);
if (r < 0) {
r = -errno;
derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
goto out_fail;
}

r = _lock();
if (r < 0) {
derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
Expand Down
24 changes: 24 additions & 0 deletions src/os/bluestore/BlueFS.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,30 @@ void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
dout(10) << __func__ << " done" << dendl;
}

int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
uint64_t *offset, uint32_t *length)
{
dout(1) << __func__ << " bdev " << id << " want " << want << dendl;
assert(id < alloc.size());
int r = alloc[id]->reserve(want);
assert(r == 0); // caller shouldn't ask for more than they can get

r = alloc[id]->allocate(want, g_conf->bluefs_alloc_size, 0,
offset, length);
assert(r >= 0);
if (*length < want)
alloc[id]->unreserve(want - *length);

block_all[id].erase(*offset, *length);
log_t.op_alloc_rm(id, *offset, *length);
r = _flush_log();
assert(r == 0);

dout(1) << __func__ << " bdev " << id << " want " << want
<< " got " << *offset << "~" << *length << dendl;
return 0;
}

uint64_t BlueFS::get_total(unsigned id)
{
Mutex::Locker l(lock);
Expand Down
4 changes: 4 additions & 0 deletions src/os/bluestore/BlueFS.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ class BlueFS {
/// gift more block space
void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);

/// reclaim block space
int reclaim_blocks(unsigned bdev, uint64_t want,
uint64_t *offset, uint32_t *length);

void flush(FileWriter *h) {
Mutex::Locker l(lock);
_flush(h, false);
Expand Down
162 changes: 101 additions & 61 deletions src/os/bluestore/BlueStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -981,7 +981,7 @@ int BlueStore::_open_alloc()

alloc = Allocator::create("stupid");
uint64_t num = 0, bytes = 0;
const map<uint64_t,uint64_t>& fl = fm->get_freelist();
const auto& fl = fm->get_freelist();
for (auto& p : fl) {
alloc->init_add_free(p.first, p.second);
++num;
Expand Down Expand Up @@ -1181,8 +1181,15 @@ int BlueStore::_open_db(bool create)
if (create) {
// note: we might waste a 4k block here if block.db is used, but it's
// simpler.
bluefs->add_block_extent(id, BLUEFS_START,
g_conf->bluestore_bluefs_initial_length);
uint64_t initial =
bdev->get_size() * (g_conf->bluestore_bluefs_min_ratio +
g_conf->bluestore_bluefs_gift_ratio);
initial = MAX(initial, g_conf->bluestore_bluefs_min);
// align to bluefs's alloc_size
initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size);
initial += g_conf->bluefs_alloc_size - BLUEFS_START;
bluefs->add_block_extent(id, BLUEFS_START, initial);
bluefs_extents.insert(BLUEFS_START, initial);
}
bluefs_shared_bdev = id;
++id;
Expand Down Expand Up @@ -1362,7 +1369,8 @@ int BlueStore::_reconcile_bluefs_freespace()
return 0;
}

int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents)
int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents,
KeyValueDB::Transaction t)
{
int ret = 0;
assert(bluefs);
Expand All @@ -1380,73 +1388,97 @@ int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents)
uint64_t total = bdev->get_size();
float my_free_ratio = (float)my_free / (float)total;

dout(10) << __func__ << " bluefs " << pretty_si_t(bluefs_free)
<< " free of " << pretty_si_t(bluefs_total)
<< " free_ratio " << bluefs_free_ratio << dendl;
dout(10) << __func__ << " bluestore " << pretty_si_t(my_free)
<< " free of " << pretty_si_t(total)
<< " free_ratio " << my_free_ratio << dendl;
uint64_t total_free = bluefs_free + my_free;

float bluefs_ratio = (float)bluefs_free / (float)total_free;

dout(10) << __func__
<< " bluefs " << pretty_si_t(bluefs_free)
<< " free (" << bluefs_free_ratio
<< ") bluestore " << pretty_si_t(my_free)
<< " free (" << my_free_ratio
<< "), bluefs_ratio " << bluefs_ratio
<< dendl;

uint64_t gift = 0;
if (bluefs_free_ratio < g_conf->bluestore_bluefs_min_free_ratio &&
bluefs_free_ratio < my_free_ratio) {
// give it more
gift = g_conf->bluestore_bluefs_min_free_ratio * bluefs_total;
dout(10) << __func__ << " bluefs_free_ratio " << bluefs_free_ratio
<< " < min_free_ratio " << g_conf->bluestore_bluefs_min_free_ratio
<< ", should gift " << pretty_si_t(gift) << dendl;
}
float bluefs_ratio = (float)bluefs_total / (float)total;
uint64_t reclaim = 0;
if (bluefs_ratio < g_conf->bluestore_bluefs_min_ratio) {
uint64_t g = total * g_conf->bluestore_bluefs_min_ratio;
gift = g_conf->bluestore_bluefs_gift_ratio * total_free;
dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
<< " < min_ratio " << g_conf->bluestore_bluefs_min_ratio
<< ", should gift " << pretty_si_t(gift) << dendl;
} else if (bluefs_ratio > g_conf->bluestore_bluefs_max_ratio) {
reclaim = g_conf->bluestore_bluefs_reclaim_ratio * total_free;
if (bluefs_total - reclaim < g_conf->bluestore_bluefs_min)
reclaim = bluefs_total - g_conf->bluestore_bluefs_min;
dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
<< " > max_ratio " << g_conf->bluestore_bluefs_max_ratio
<< ", should reclaim " << pretty_si_t(reclaim) << dendl;
}
if (bluefs_total < g_conf->bluestore_bluefs_min) {
uint64_t g = g_conf->bluestore_bluefs_min;
dout(10) << __func__ << " bluefs_total " << bluefs_total
<< " < min " << g_conf->bluestore_bluefs_min
<< ", should gift " << pretty_si_t(g) << dendl;
if (g > gift)
gift = g;
reclaim = 0;
}

float fs_main_ratio = (float)bluefs_free / (float)my_free;
dout(10) << __func__ << " fs:main free ratio " << fs_main_ratio << dendl;

if (gift) {
float gift_ratio = (float)gift / (float)bluefs_free;
if (gift_ratio < g_conf->bluestore_bluefs_min_gift_ratio) {
dout(10) << __func__ << " proposed gift of " << pretty_si_t(gift)
<< " gift_ratio " << gift_ratio
<< " < min_gift_ratio " << g_conf->bluestore_bluefs_min_gift_ratio
<< dendl;
} else {
// round up to alloc size
uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size;
gift = ROUND_UP_TO(gift, min_alloc_size);
// round up to alloc size
uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size;
gift = ROUND_UP_TO(gift, min_alloc_size);

// hard cap to fit into 32 bits
gift = MIN(gift, 1ull<<31);
dout(10) << __func__ << " gifting " << gift
// hard cap to fit into 32 bits
gift = MIN(gift, 1ull<<31);
dout(10) << __func__ << " gifting " << gift
<< " (" << pretty_si_t(gift) << ")" << dendl;

// fixme: just do one allocation to start...
int r = alloc->reserve(gift);
assert(r == 0);

bluestore_extent_t e;
r = alloc->allocate(gift, min_alloc_size, 0, &e.offset, &e.length);
if (r < 0) {
assert(0 == "allocate failed, wtf");
return r;
}
if (e.length < gift) {
alloc->unreserve(gift - e.length);
}
// fixme: just do one allocation to start...
int r = alloc->reserve(gift);
assert(r == 0);

dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
extents->push_back(e);
ret = 1;
bluestore_extent_t e;
r = alloc->allocate(gift, min_alloc_size, 0, &e.offset, &e.length);
if (r < 0) {
assert(0 == "allocate failed, wtf");
return r;
}
if (e.length < gift) {
alloc->unreserve(gift - e.length);
}

dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
extents->push_back(e);
ret = 1;
}

// FIXME: reclaim from bluefs?
// reclaim from bluefs?
if (reclaim) {
// round up to alloc size
uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size;
reclaim = ROUND_UP_TO(reclaim, min_alloc_size);

// hard cap to fit into 32 bits
reclaim = MIN(reclaim, 1ull<<31);
dout(10) << __func__ << " reclaiming " << reclaim
<< " (" << pretty_si_t(reclaim) << ")" << dendl;

uint64_t offset = 0;
uint32_t length = 0;

// NOTE: this will block and do IO.
int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
&offset, &length);
assert(r >= 0);

bluefs_extents.erase(offset, length);

fm->release(offset, length, t);
alloc->release(offset, length);
ret = 1;
}

return ret;
}
Expand Down Expand Up @@ -1593,11 +1625,10 @@ int BlueStore::mkfs()
KeyValueDB::Transaction t = db->get_transaction();
uint64_t reserved = 0;
if (g_conf->bluestore_bluefs) {
reserved = BLUEFS_START + g_conf->bluestore_bluefs_initial_length;
dout(20) << __func__ << " reserved first " << reserved
<< " bytes for bluefs" << dendl;
bluefs_extents.insert(BLUEFS_START,
g_conf->bluestore_bluefs_initial_length);
assert(bluefs_extents.num_intervals() == 1);
interval_set<uint64_t>::iterator p = bluefs_extents.begin();
reserved = p.get_start() + p.get_len();
dout(20) << __func__ << " reserved " << reserved << " for bluefs" << dendl;
bufferlist bl;
::encode(bluefs_extents, bl);
t->set(PREFIX_SUPER, "bluefs_extents", bl);
Expand Down Expand Up @@ -2126,8 +2157,8 @@ int BlueStore::fsck()

dout(1) << __func__ << " checking freelist vs allocated" << dendl;
{
const map<uint64_t,uint64_t>& free = fm->get_freelist();
for (map<uint64_t,uint64_t>::const_iterator p = free.begin();
const auto& free = fm->get_freelist();
for (auto p = free.begin();
p != free.end(); ++p) {
if (used_blocks.intersects(p->first, p->second)) {
derr << __func__ << " free extent " << p->first << "~" << p->second
Expand Down Expand Up @@ -2342,6 +2373,11 @@ int BlueStore::_do_read(
if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
dout(20) << __func__ << " will do buffered read" << dendl;
buffered = true;
} else if (g_conf->bluestore_default_buffered_read &&
(op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
dout(20) << __func__ << " defaulting to buffered read" << dendl;
buffered = true;
}

dout(20) << __func__ << " " << offset << "~" << length << " size "
Expand Down Expand Up @@ -3523,7 +3559,7 @@ void BlueStore::_kv_sync_thread()

vector<bluestore_extent_t> bluefs_gift_extents;
if (bluefs) {
int r = _balance_bluefs_freespace(&bluefs_gift_extents);
int r = _balance_bluefs_freespace(&bluefs_gift_extents, t);
assert(r >= 0);
if (r > 0) {
for (auto& p : bluefs_gift_extents) {
Expand Down Expand Up @@ -6128,6 +6164,10 @@ int BlueStore::_split_collection(TransContext *txc,
assert(d->cnode.bits == bits);
r = 0;

bufferlist bl;
::encode(c->cnode, bl);
txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
<< " bits " << bits << " = " << r << dendl;
return r;
Expand Down
3 changes: 2 additions & 1 deletion src/os/bluestore/BlueStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,8 @@ class BlueStore : public ObjectStore {
int _open_super_meta();

int _reconcile_bluefs_freespace();
int _balance_bluefs_freespace(vector<bluestore_extent_t> *extents);
int _balance_bluefs_freespace(vector<bluestore_extent_t> *extents,
KeyValueDB::Transaction t);
void _commit_bluefs_freespace(const vector<bluestore_extent_t>& extents);

CollectionRef _get_collection(coll_t cid);
Expand Down
17 changes: 11 additions & 6 deletions src/os/bluestore/FreelistManager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,14 @@ void FreelistManager::shutdown()
/// Locked entry point for dumping the freelist: acquires the lock and
/// delegates the actual output to _dump().
void FreelistManager::dump()
{
  Mutex::Locker locker(lock);
  _dump();
}

void FreelistManager::_dump()
{
dout(30) << __func__ << " " << total_free
<< " in " << kv_free.size() << " extents" << dendl;
for (std::map<uint64_t,uint64_t>::iterator p = kv_free.begin();
for (auto p = kv_free.begin();
p != kv_free.end();
++p) {
dout(30) << __func__ << " " << p->first << "~" << p->second << dendl;
Expand Down Expand Up @@ -109,7 +114,7 @@ int FreelistManager::allocate(
Mutex::Locker l(lock);
dout(10) << __func__ << " " << offset << "~" << length << dendl;
total_free -= length;
map<uint64_t,uint64_t>::iterator p = kv_free.lower_bound(offset);
auto p = kv_free.lower_bound(offset);
if ((p == kv_free.end() || p->first > offset) &&
p != kv_free.begin()) {
--p;
Expand All @@ -121,7 +126,7 @@ int FreelistManager::allocate(
if (p != kv_free.end()) {
derr << " existing extent " << p->first << "~" << p->second << dendl;
}
dump();
_dump();
assert(0 == "bad allocate");
}

Expand Down Expand Up @@ -182,7 +187,7 @@ int FreelistManager::release(
Mutex::Locker l(lock);
dout(10) << __func__ << " " << offset << "~" << length << dendl;
total_free += length;
map<uint64_t,uint64_t>::iterator p = kv_free.lower_bound(offset);
auto p = kv_free.lower_bound(offset);

// contiguous with previous extent?
if (p != kv_free.begin()) {
Expand All @@ -199,7 +204,7 @@ int FreelistManager::release(
} else if (p->first + p->second > offset) {
derr << __func__ << " bad release " << offset << "~" << length
<< " overlaps with " << p->first << "~" << p->second << dendl;
dump();
_dump();
assert(0 == "bad release overlap");
} else {
dout(30) << __func__ << " previous extent " << p->first << "~" << p->second
Expand All @@ -221,7 +226,7 @@ int FreelistManager::release(
} else if (p->first < offset + length) {
derr << __func__ << " bad release " << offset << "~" << length
<< " overlaps with " << p->first << "~" << p->second << dendl;
dump();
_dump();
assert(0 == "bad release overlap");
} else {
dout(30) << __func__ << " next extent " << p->first << "~" << p->second
Expand Down

0 comments on commit 7241bf3

Please sign in to comment.