Skip to content

Commit

Permalink
wip buffer cache
Browse files Browse the repository at this point in the history
  • Loading branch information
liewegas committed May 16, 2016
1 parent 922d16c commit b9ac31a
Show file tree
Hide file tree
Showing 2 changed files with 218 additions and 1 deletion.
24 changes: 23 additions & 1 deletion src/os/bluestore/BlueStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3794,7 +3794,8 @@ BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
TransContext *txc = new TransContext(osr);
txc->t = db->get_transaction();
osr->queue_new(txc);
dout(20) << __func__ << " osr " << osr << " = " << txc << dendl;
dout(20) << __func__ << " osr " << osr << " = " << txc
<< " seq " << txc->seq << dendl;
return txc;
}

Expand Down Expand Up @@ -3846,6 +3847,9 @@ void BlueStore::_txc_state_proc(TransContext *txc)
//assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
txc->state = TransContext::STATE_KV_QUEUED;
for (auto& o : txc->onodes) {
o->bc.finish_write(txc->seq);
}
if (!g_conf->bluestore_sync_transaction) {
if (g_conf->bluestore_sync_submit_transaction) {
_txc_finalize_kv(txc, txc->t);
Expand Down Expand Up @@ -4966,6 +4970,17 @@ void BlueStore::_dump_onode(OnodeRef o, int log_level)
dout(log_level) << __func__ << " overlay_refs " << o->onode.overlay_refs
<< dendl;
}
if (!o->bc.empty()) {
dout(log_level) << __func__ << " buffer_cache size 0x" << std::hex
<< o->bc.size << std::dec << dendl;
for (auto& i : o->bc.buffer_map) {
dout(log_level) << __func__ << " 0x" << std::hex << i.first << "~0x"
<< i.second->length << std::dec
<< " seq " << i.second->seq
<< " " << Buffer::get_state_name(i.second->state)
<< dendl;
}
}
if (o->tail_bl.length()) {
dout(log_level) << __func__ << " tail offset 0x" << std::hex << o->tail_offset
<< " len 0x" << o->tail_bl.length() << std::dec
Expand Down Expand Up @@ -5564,6 +5579,9 @@ int BlueStore::_do_write(
wctx.buffered = true;
}

// write in buffer cache
o->bc.write(txc->seq, offset, bl);

This comment has been minimized.

Copy link
@ifed01

ifed01 May 16, 2016

Contributor

Since the _do_write method takes a length parameter, which IMHO allows writing just a part of bl, you should probably pass it down to bc.write too, and discard/add_buffer accordingly.

This comment has been minimized.

Copy link
@liewegas

liewegas via email May 16, 2016

Author Member

This comment has been minimized.

Copy link
@ifed01

ifed01 May 16, 2016

Contributor

I doubt this sounds less complex....

This comment has been minimized.

Copy link
@liewegas

liewegas May 16, 2016

Author Member

Yeah, I just mean we should avoid the case where length != bl.length() unless we have a really good reason.


bufferlist::iterator p = bl.begin();
if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
(length != min_alloc_size)) {
Expand Down Expand Up @@ -5678,6 +5696,8 @@ int BlueStore::_do_zero(TransContext *txc,
o->clear_tail();
}

o->bc.discard(offset, length);

WriteContext wctx;
o->onode.punch_hole(offset, length, &wctx.lex_old);
_wctx_finish(txc, c, o, &wctx);
Expand Down Expand Up @@ -5706,6 +5726,8 @@ int BlueStore::_do_truncate(
// they may touch.
o->flush();

o->bc.truncate(offset);

WriteContext wctx;
o->onode.punch_hole(offset, o->onode.size, &wctx.lex_old);
_wctx_finish(txc, c, o, &wctx);
Expand Down
195 changes: 195 additions & 0 deletions src/os/bluestore/BlueStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,199 @@ class BlueStore : public ObjectStore,

class TransContext;

/// cached buffer
struct Buffer {
enum {
  STATE_UNDEF   = 0,
  STATE_CLEAN   = 1,
  STATE_WRITING = 2,
  STATE_READING = 3,
};
/// Translate a STATE_* value into a short printable name.
/// Any value outside the known states yields "???".
static const char *get_state_name(int s) {
  // indexed by STATE_* value
  static const char *const names[] = {
    "undef", "clean", "writing", "reading",
  };
  if (s < STATE_UNDEF || s > STATE_READING) {
    return "???";
  }
  return names[s];
}

unsigned state; ///< STATE_*
uint64_t seq;
uint64_t offset, length;

This comment has been minimized.

Copy link
@ifed01

ifed01 May 16, 2016

Contributor

It looks like the offset field duplicates the key in BufferSpace::buffer_map, so one could get rid of it here.

bufferlist data;

boost::intrusive::list_member_hook<> onode_lru_item;

/// Construct a buffer covering [o, o+l) in state s for sequence q.
/// The data member is left default-constructed (no payload attached).
Buffer(unsigned s, uint64_t q, uint64_t o, uint64_t l)
: state(s), seq(q), offset(o), length(l) {}
/// Construct a buffer at offset o in state s for sequence q whose
/// payload is initialized from b; length is taken from b.length().
Buffer(unsigned s, uint64_t q, uint64_t o, bufferlist& b)
: state(s), seq(q), offset(o), length(b.length()), data(b) {}

/// true if state == STATE_CLEAN
bool is_clean() const {
return state == STATE_CLEAN;
}
/// true if state == STATE_WRITING
bool is_writing() const {
return state == STATE_WRITING;
}
/// true if state == STATE_READING
bool is_reading() const {
return state == STATE_READING;
}

/// one past the last byte covered by this buffer (offset + length)
uint64_t end() const {
return offset + length;
}

/// Shrink this buffer so it covers only its first newlen bytes.
///
/// The offset is unchanged; only the tail is dropped.  If a payload is
/// attached it is cut down to the same length.  newlen must be strictly
/// smaller than the current length.
void truncate(uint64_t newlen) {
  // Truncation can only shrink.  The previous check (newlen > length) was
  // inverted and fired on every legitimate call.
  assert(newlen < length);
  if (data.length()) {
    // keep only the leading newlen bytes of the payload
    bufferlist t;
    t.substr_of(data, 0, newlen);
    data.claim(t);
  }
  length = newlen;
}

/// Emit this buffer's fields to the formatter: state name, seq, offset,
/// logical length, and the length of the attached payload (data_length),
/// which may differ from length when no data is attached.
void dump(Formatter *f) const {
f->dump_string("state", get_state_name(state));
f->dump_unsigned("seq", seq);
f->dump_unsigned("offset", offset);
f->dump_unsigned("length", length);
f->dump_unsigned("data_length", data.length());
}
};

struct BufferSpace {
typedef boost::intrusive::list<
Buffer,
boost::intrusive::member_hook<
Buffer,
boost::intrusive::list_member_hook<>,
&Buffer::onode_lru_item> > lru_list_t;

map<uint64_t,std::unique_ptr<Buffer>> buffer_map;
lru_list_t lru;
uint64_t size = 0;

/// Insert b keyed by its offset, link it at the front of the lru, and
/// account its length in size.  The unique_ptr stored in buffer_map takes
/// ownership of b.
/// NOTE(review): if an entry already exists at b->offset, reset() destroys
/// the old Buffer without unlinking it from lru or subtracting its length
/// from size; the callers visible here discard the range first.
void _add_buffer(Buffer *b) {
buffer_map[b->offset].reset(b);
lru.push_front(*b);
size += b->length;
}
/// Remove the buffer at p: subtract its length from size, unlink it from
/// the lru, then erase the map entry (which destroys the Buffer owned by
/// the unique_ptr).  The lru unlink happens before the erase so the hook
/// is not linked when the Buffer is destroyed.
void _rm_buffer(map<uint64_t,std::unique_ptr<Buffer>>::iterator p) {
size -= p->second->length;
lru.erase(lru.iterator_to(*p->second));
buffer_map.erase(p);
}

/// move to top of lru
void _touch_buffer(Buffer *b) {
  // unlink from wherever it currently sits, then relink at the head
  lru.erase(lru.iterator_to(*b));
  lru.push_front(*b);
}

/// Return the first buffer that could contain or follow offset: the
/// preceding buffer if it extends past offset, otherwise the first buffer
/// starting at or after offset (possibly end()).
map<uint64_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
  uint64_t offset) {
  auto at_or_after = buffer_map.lower_bound(offset);
  if (at_or_after == buffer_map.begin()) {
    // nothing starts before offset
    return at_or_after;
  }
  auto before = at_or_after;
  --before;
  const uint64_t before_end = before->first + before->second->length;
  // the predecessor only matters if it reaches past offset
  return before_end > offset ? before : at_or_after;
}

/// true if no buffers are cached (buffer_map has no entries)
bool empty() const {
return buffer_map.empty();
}

void discard(uint64_t offset, uint64_t length) {
auto i = _data_lower_bound(offset);
uint64_t end = offset + length;
while (i != buffer_map.end()) {
Buffer *b = i->second.get();
if (b->offset >= offset + length) {
break;
}
if (b->offset < offset) {
uint64_t drop = offset - b->offset;
b->truncate(b->length - drop);
size -= drop;
++i;
continue;
}
if (b->end() <= end) {
_rm_buffer(i++);
continue;
}
uint64_t keep = b->end() - end;
size -= b->length - keep;
if (b->data.length()) {
bufferlist bl;
bl.substr_of(b->data, b->length - keep, keep);
_add_buffer(new Buffer(b->state, b->seq, end, bl));
_rm_buffer(i);
} else {
_add_buffer(new Buffer(b->state, b->seq, end, keep));
_rm_buffer(i);
}
break;
}
}

/// Cache a write: drop anything cached in [offset, offset+bl.length())
/// and insert a new STATE_WRITING buffer for that range, tagged with seq
/// and initialized from bl.
void write(uint64_t seq, uint64_t offset, bufferlist& bl) {
discard(offset, bl.length());
_add_buffer(new Buffer(Buffer::STATE_WRITING, seq, offset, bl));
}
void finish_write(uint64_t seq) {
// fixme: be more efficient... intrusive_list just for writing, perhaps?

This comment has been minimized.

Copy link
@yuyuyu101

yuyuyu101 May 16, 2016

Member

Note: if we support async reads later, we will need to link reading buffers too.

// every buffer still marked writing at or before seq becomes clean
for (auto& entry : buffer_map) {
  Buffer *b = entry.second.get();
  if (!b->is_writing()) {
    continue;
  }
  if (b->seq <= seq) {
    b->state = Buffer::STATE_CLEAN;
  }
}
}

/// Drop everything cached at or beyond offset.
void truncate(uint64_t offset) {
  // length chosen so that offset + length is the largest uint64_t value
  const uint64_t to_max = static_cast<uint64_t>(-1) - offset;
  discard(offset, to_max);
}

/// Evict clean buffers, least recently used first, until size <= keep or
/// no clean buffer remains.  Buffers that are not clean are skipped and
/// left in place.
void trim(uint64_t keep) {
  // Buffers are pushed to the front of the lru when added or touched, so
  // the coldest entry is at the back.  Walk backwards from end(); end()
  // itself is never dereferenced.
  auto i = lru.end();
  while (size > keep && i != lru.begin()) {
    --i;
    Buffer *b = &*i;
    if (!b->is_clean()) {
      continue;
    }
    auto p = buffer_map.find(b->offset);
    assert(p != buffer_map.end());
    // _rm_buffer unlinks and destroys *i; step to the still-valid
    // successor first so the next --i lands on the predecessor.
    auto after = i;
    ++after;
    _rm_buffer(p);
    i = after;
  }
}

/// Emit the total cached size followed by a "buffers" array with one
/// "buffer" object per map entry, in offset order.
void dump(Formatter *f) const {
f->dump_unsigned("size", size);
f->open_array_section("buffers");
for (auto& i : buffer_map) {
f->open_object_section("buffer");
// the map key must always mirror the buffer's own offset
assert(i.first == i.second->offset);
i.second->dump(f);
f->close_section();
}
f->close_section();
}
};

/// an in-memory extent-map, shared by a group of objects (w/ same hash value)
struct BnodeSet;

Expand Down Expand Up @@ -155,6 +348,8 @@ class BlueStore : public ObjectStore,
std::condition_variable flush_cond; ///< wait here for unapplied txns
set<TransContext*> flush_txns; ///< committing or wal txns

BufferSpace bc;

uint64_t tail_offset = 0;
uint64_t tail_txc_seq = 0;
bufferlist tail_bl;
Expand Down

0 comments on commit b9ac31a

Please sign in to comment.