Commit

os/bluestore: introduce offline DB/WAL volume migration for ceph-bluestore-tool.

Signed-off-by: Igor Fedotov <ifedotov@suse.com>
ifed01 committed Jul 18, 2018
1 parent 890ce04 commit a729935
Showing 4 changed files with 401 additions and 40 deletions.
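
The new BlueFS::device_migrate_to_existing() entry point added below copies all BlueFS data off a set of source devices onto a device that is already attached, so that ceph-bluestore-tool can retire a standalone DB or WAL volume offline. The snippet that follows is only a rough illustration of how a caller might drive it; the helper name and the surrounding setup (an already prepared BlueFS instance, the specific device ids) are assumptions, not part of this commit.

// Hypothetical caller sketch, not taken from this commit: drains the standalone
// WAL device onto the already-attached DB device so the WAL volume can be
// retired afterwards. Assumes BlueFS has been set up by the caller and that the
// BDEV_* ids are visible as declared in BlueFS.h.
#include <set>
#include "os/bluestore/BlueFS.h"

int migrate_wal_into_db(BlueFS& fs, CephContext* cct)  // hypothetical helper
{
  std::set<int> devs_source = { BlueFS::BDEV_WAL };    // devices to drain
  return fs.device_migrate_to_existing(cct, devs_source, BlueFS::BDEV_DB);
}
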
233 changes: 219 additions & 14 deletions src/os/bluestore/BlueFS.cc
@@ -375,7 +375,7 @@ int BlueFS::mkfs(uuid_d osd_uuid)

// write supers
super.log_fnode = log_file->fnode;
_write_super();
_write_super(BDEV_DB);
flush_bdev();

// clean up
@@ -514,7 +514,7 @@ int BlueFS::fsck()
return 0;
}

int BlueFS::_write_super()
int BlueFS::_write_super(int dev)
{
// build superblock
bufferlist bl;
@@ -527,7 +527,7 @@ int BlueFS::_write_super()
assert(bl.length() <= get_super_length());
bl.append_zero(get_super_length() - bl.length());

bdev[BDEV_DB]->write(get_super_offset(), bl, false);
bdev[dev]->write(get_super_offset(), bl, false);
dout(20) << __func__ << " v " << super.version
<< " crc 0x" << std::hex << crc
<< " offset 0x" << get_super_offset() << std::dec
@@ -970,6 +970,132 @@ int BlueFS::log_dump()
return 0;
}

int BlueFS::device_migrate_to_existing(
CephContext *cct,
const set<int>& devs_source,
int dev_target)
{
vector<byte> buf;
bool buffered = cct->_conf->bluefs_buffered_io;

assert(dev_target < (int)MAX_BDEV);

bool no_resulting_db = false;
int dev_target_new = dev_target;

// A slow device with no separate DB device is addressed via BDEV_DB,
// hence the renaming below.
if (devs_source.count(BDEV_DB) && dev_target == BDEV_SLOW) {
dev_target_new = BDEV_DB;
no_resulting_db = true;
dout(0) << __func__ << " super to be written to " << dev_target << dendl;
}

for (auto& p : file_map) {
//do not copy log
if (p.second->fnode.ino == 1) {
continue;
}
auto& fnode_extents = p.second->fnode.extents;

for (auto ext_it = fnode_extents.begin();
ext_it != p.second->fnode.extents.end();
++ext_it) {
if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
bluefs_extent_t old_ext = *ext_it;
PExtentVector extents;
auto l =
_allocate_without_fallback(dev_target, old_ext.length, &extents);
if (l == 0) {
buf.resize(old_ext.length);
int r = bdev[old_ext.bdev]->read_random(
old_ext.offset,
old_ext.length,
(char*)&buf.at(0),
buffered);
dout(0) << __func__ << " read = " << r << dendl;
if (r != 0) {
derr << __func__ << " failed to read 0x" << std::hex
<< old_ext.offset << "~" << old_ext.length << std::dec
<< " from " << (int)old_ext.bdev << dendl;
return -EIO;
}

assert(extents.size() > 0);
uint64_t src_buf_pos = 0;
{
// overwrite existing extent
*ext_it =
bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length);
bufferlist bl;
bl.append((char*)&buf.at(src_buf_pos), extents[0].length);
dout(0) << __func__ << " writing" << dendl;
int r = bdev[dev_target]->write(extents[0].offset, bl, buffered);
dout(0) << __func__ << " write = " << r << dendl;
assert(r == 0);
src_buf_pos += extents[0].length;
}
// then insert more extents if needed
for (size_t i = 1; i < extents.size(); ++i) {
bufferlist bl;
bl.append((char*)&buf.at(src_buf_pos), extents[i].length);
++ext_it;
ext_it = fnode_extents.emplace(ext_it, dev_target_new,
extents[i].offset, extents[i].length);
dout(0) << __func__ << " writing" << dendl;
int r = bdev[dev_target]->write(extents[i].offset, bl, buffered);
dout(0) << __func__ << " write = " << r << dendl;
assert(r == 0);
src_buf_pos += extents[i].length;
}
{
PExtentVector to_release;
to_release.emplace_back(old_ext.offset, old_ext.length);
alloc[old_ext.bdev]->release(to_release);
}

} else {
derr << __func__ << " unable to allocate len 0x" << std::hex
<< old_ext.length << std::dec << " from " << (int)dev_target
<< dendl;
return -ENOSPC;
}
} else if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
ext_it->bdev = dev_target_new;
}
}
auto& prefer_bdev = p.second->fnode.prefer_bdev;
if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
prefer_bdev = dev_target_new;
}
}
auto& log_fnode = log_writer->file->fnode;
int log_dev_cur = bdev[BDEV_WAL] ? BDEV_WAL :
bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
if (devs_source.count(log_dev_cur)) {
dout(0) << __func__ << " log moved from " << log_dev_cur
<< " to " << dev_target << dendl;
log_dev_cur = dev_target;
log_fnode.prefer_bdev = dev_target_new;
}

_rewrite_log_sync(true,
log_dev_cur,
log_fnode.prefer_bdev,
no_resulting_db,
devs_source.count(BDEV_WAL) != 0);
return 0;
}

int BlueFS::device_migrate_to_new(
CephContext *cct,
const set<int>& devs_source,
const string& dev_target)
{
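// Not implemented in this change: migrating onto a brand-new target device always fails for now.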
return -1;
}


BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
{
auto p = file_map.find(ino);
@@ -1202,7 +1328,9 @@ bool BlueFS::_should_compact_log()
return true;
}

void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
bool no_resulting_db,
bool no_resulting_wal)
{
t->seq = 1;
t->uuid = super.uuid;
@@ -1212,10 +1340,20 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
interval_set<uint64_t>& p = block_all[bdev];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
auto bdev_new = bdev;
if (no_resulting_wal && bdev == BDEV_WAL) {
continue;
}
if (no_resulting_db && bdev == BDEV_DB) {
continue;
}
if (no_resulting_db && bdev == BDEV_SLOW) {
bdev_new = BDEV_DB;
}
dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
<< std::hex << q.get_start() << "~" << q.get_len() << std::dec
<< dendl;
t->op_alloc_add(bdev, q.get_start(), q.get_len());
t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
}
}
for (auto& p : file_map) {
@@ -1239,13 +1377,27 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
void BlueFS::_compact_log_sync()
{
dout(10) << __func__ << dendl;
_rewrite_log_sync(false,
log_writer->file->fnode.prefer_bdev,
log_writer->file->fnode.prefer_bdev,
false,
false);
logger->inc(l_bluefs_log_compactions);
}

void BlueFS::_rewrite_log_sync(bool allocate_with_fallback,
int log_dev,
int new_log_dev,
bool no_resulting_db,
bool no_resulting_wal)
{
File *log_file = log_writer->file.get();

// clear out log (be careful who calls us!!!)
log_t.clear();

bluefs_transaction_t t;
_compact_log_dump_metadata(&t);
_compact_log_dump_metadata(&t, no_resulting_db, no_resulting_wal);

dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
t.op_jump_seq(log_seq);
@@ -1259,9 +1411,22 @@ void BlueFS::_compact_log_sync()

mempool::bluefs::vector<bluefs_extent_t> old_extents;
uint64_t old_allocated = 0;
int r;
log_file->fnode.swap_extents(old_extents, old_allocated);
int r = _allocate(log_file->fnode.prefer_bdev, need, &log_file->fnode);
assert(r == 0);
if (allocate_with_fallback) {
r = _allocate(log_dev, need, &log_file->fnode);
assert(r == 0);
} else {
PExtentVector extents;
r = _allocate_without_fallback(log_dev,
need,
&extents);
assert(r == 0);
for (auto& p : extents) {
log_file->fnode.append_extent(
bluefs_extent_t(log_dev, p.offset, p.length));
}
}

_close_writer(log_writer);

@@ -1280,18 +1445,24 @@ void BlueFS::_compact_log_sync()
#endif
flush_bdev();

dout(10) << __func__ << " writing super" << dendl;
super.log_fnode = log_file->fnode;
// rename device if needed
if (log_dev != new_log_dev) {
dout(10) << __func__ << " renaming log extents to " << new_log_dev << dendl;
for (auto& p : super.log_fnode.extents) {
p.bdev = new_log_dev;
}
}
dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;

++super.version;
_write_super();
_write_super(no_resulting_db ? BDEV_SLOW : BDEV_DB);
flush_bdev();

dout(10) << __func__ << " release old log extents " << old_extents << dendl;
for (auto& r : old_extents) {
pending_release[r.bdev].insert(r.offset, r.length);
}

logger->inc(l_bluefs_log_compactions);
}

/*
@@ -1358,7 +1529,7 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
bluefs_transaction_t t;
// avoid recording the same data twice, in log_t and in _compact_log_dump_metadata.
log_t.clear();
_compact_log_dump_metadata(&t);
_compact_log_dump_metadata(&t, false, false);

// conservative estimate for final encoded size
new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
Expand Down Expand Up @@ -1430,7 +1601,7 @@ void BlueFS::_compact_log_async(std::unique_lock<std::mutex>& l)
dout(10) << __func__ << " writing super" << dendl;
super.log_fnode = log_file->fnode;
++super.version;
_write_super();
_write_super(BDEV_DB);

lock.unlock();
flush_bdev();
@@ -1980,6 +2151,40 @@ void BlueFS::flush_bdev()
}
}

int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
PExtentVector* extents)
{
dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
<< " from " << (int)id << dendl;
assert(id < alloc.size());
uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;

uint64_t left = round_up_to(len, min_alloc_size);

if (!alloc[id]) {
return -ENOENT;
}
extents->reserve(4); // 4 should be (more than) enough for most allocations
int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
if (alloc_len < (int64_t)left) {
if (alloc_len != 0) {
alloc[id]->release(*extents);
}
if (bdev[id])
derr << __func__ << " failed to allocate 0x" << std::hex << left
<< " on bdev " << (int)id
<< ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
else
derr << __func__ << " failed to allocate 0x" << std::hex << left
<< " on bdev " << (int)id << ", dne" << std::dec << dendl;
if (alloc[id])
alloc[id]->dump();
return -ENOSPC;
}

return 0;
}

int BlueFS::_allocate(uint8_t id, uint64_t len,
bluefs_fnode_t* node)
{
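
A detail shared by device_migrate_to_existing() and _compact_log_dump_metadata() above is the renaming rule: once the standalone DB device is gone and only the slow device remains, BlueFS keeps addressing that slow device as BDEV_DB. Below is a minimal, self-contained sketch of that mapping; the device ids are redeclared locally for illustration only (the real definitions live in BlueFS.h).

#include <cassert>

// Local stand-ins for the BlueFS device slots, for illustration only.
enum { BDEV_WAL = 0, BDEV_DB = 1, BDEV_SLOW = 2 };

// Mirror of the bdev_new / dev_target_new logic above: after a migration that
// leaves no standalone DB device, anything recorded against the slow device is
// recorded under BDEV_DB instead, while every other id keeps its meaning.
static int remap_after_migration(int bdev, bool no_resulting_db)
{
  return (no_resulting_db && bdev == BDEV_SLOW) ? BDEV_DB : bdev;
}

int main()
{
  assert(remap_after_migration(BDEV_SLOW, true) == BDEV_DB);    // slow device takes over the DB slot
  assert(remap_after_migration(BDEV_WAL, true) == BDEV_WAL);    // WAL id keeps its meaning
  assert(remap_after_migration(BDEV_SLOW, false) == BDEV_SLOW); // nothing to rename
  return 0;
}
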
24 changes: 22 additions & 2 deletions src/os/bluestore/BlueFS.h
@@ -273,6 +273,9 @@ class BlueFS {

int _allocate(uint8_t bdev, uint64_t len,
bluefs_fnode_t* node);
int _allocate_without_fallback(uint8_t id, uint64_t len,
PExtentVector* extents);

int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
int _flush(FileWriter *h, bool force);
int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);
@@ -287,10 +290,18 @@ class BlueFS {
uint64_t jump_to = 0);
uint64_t _estimate_log_size();
bool _should_compact_log();
void _compact_log_dump_metadata(bluefs_transaction_t *t);
void _compact_log_dump_metadata(bluefs_transaction_t *t,
bool no_resulting_db,
bool no_resulting_wal);
void _compact_log_sync();
void _compact_log_async(std::unique_lock<std::mutex>& l);

void _rewrite_log_sync(bool allocate_with_fallback,
int log_dev,
int new_log_dev,
bool no_resulting_db,
bool no_resulting_wal);

//void _aio_finish(void *priv);

void _flush_bdev_safely(FileWriter *h);
@@ -316,7 +327,7 @@ class BlueFS {
void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);

int _open_super();
int _write_super();
int _write_super(int dev);
int _replay(bool noop, bool to_stdout = false); ///< replay journal

FileWriter *_create_writer(FileRef f);
@@ -346,6 +357,15 @@ class BlueFS {
void get_devices(set<string> *ls);
int fsck();

int device_migrate_to_new(
CephContext *cct,
const set<int>& devs_source,
const string& dev_target);
int device_migrate_to_existing(
CephContext *cct,
const set<int>& devs_source,
int dev_target);

uint64_t get_fs_usage();
uint64_t get_total(unsigned id);
uint64_t get_free(unsigned id);
