Skip to content

Commit

Permalink
osd: Modify OSD Fast-Shutdown to work safely
Browse files Browse the repository at this point in the history
quiesce all activities and destage allocations to disk before killing the OSD

    1) keep the old (unsafe) fast-shutdown when we are not using NCB (non null-manager())
    2) skip service.prepare_to_stop() which can take as much as 10 seconds
    3) skip debug options in fast-shutdown
    4) set_state(STATE_STOPPING) which will stop accepting new tasks to this OSD
    5) clear op_shardedwq queues; this is safe since we haven't started processing them
    6) stop timer
    7) drain osd_op_tp (no new items will be added)
    8) now we can safely call umount which will close_db/bluefs and will destage allocation to disk
    9) skip _shutdown_cache() when we are in the middle of a fast-shutdown
    10) increase debug level on fast-shutdown
    11) add option for bluestore_qfsck_on_mount to force scan on mount for all tests
    12) disable fsck-on-umount when running fast-shutdown
    13) add an option to increase debug level at fast-shutdown umount()
    14) set a time limit to fast-shutdown

    15) Bug-Fix: BlueStore::pool_statfs must not access db after it was removed
    16) Fix error message for qfsck (error was caused by PR #44563)

    17) make shutdown-timeout configurable

Fixes: https://tracker.ceph.com/issues/53266
Signed-off-by: Gabriel Benhanokh <gbenhano@redhat.com>
  • Loading branch information
benhanokh committed Mar 7, 2022
1 parent c0c05ef commit 9b2a64a
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 28 deletions.
13 changes: 13 additions & 0 deletions src/common/options/global.yaml.in
Original file line number Diff line number Diff line change
Expand Up @@ -3266,6 +3266,13 @@ options:
slow shutdown is primarily useful for doing memory leak checking with valgrind.
default: true
with_legacy: true
# Time budget for an OSD fast-shutdown. OSD::shutdown() checks it once, after
# umount() has returned: a non-zero value makes the OSD assert that the whole
# fast-shutdown took less than this many seconds; 0 skips the check.
- name: osd_fast_shutdown_timeout
type: int
level: advanced
desc: timeout in seconds for osd fast-shutdown (0 is unlimited)
default: 15
with_legacy: true
min: 0
- name: osd_fast_shutdown_notify_mon
type: bool
level: advanced
Expand Down Expand Up @@ -4937,6 +4944,12 @@ options:
This setting is used only when OSD is doing ``--mkfs``.
Next runs of OSD retrieve sharding from disk.
default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P
# Dev-level switch, enabled by default.
# NOTE(review): the commit description says this was added to force the
# quick-fsck scan on mount for all tests — confirm where it is consumed.
- name: bluestore_qfsck_on_mount
type: bool
level: dev
desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state
default: true
with_legacy: true
- name: bluestore_fsck_on_mount
type: bool
level: dev
Expand Down
3 changes: 2 additions & 1 deletion src/os/ObjectStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ class ObjectStore {
virtual bool needs_journal() = 0; //< requires a journal
virtual bool wants_journal() = 0; //< prefers a journal
virtual bool allows_journal() = 0; //< allows a journal

// hook called by OSD::shutdown() right before umount() on the fast-shutdown path; no-op by default
virtual void prepare_for_fast_shutdown() {}
// whether the store runs with a null-manager; stores that do not override this report false
virtual bool has_null_manager() { return false; }
// return store min allocation size, if applicable
virtual uint64_t get_min_alloc_size() const {
return 0;
Expand Down
43 changes: 25 additions & 18 deletions src/os/bluestore/BlueStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7565,9 +7565,16 @@ void BlueStore::set_cache_shards(unsigned num)
}
}

//---------------------------------------------
// Tell the caller whether this store currently runs with a null manager.
// Returns false when no manager object (fm) is set at all.
bool BlueStore::has_null_manager()
{
  if (!fm) {
    // nothing to query -> cannot be a null manager
    return false;
  }
  return fm->is_null_manager();
}

int BlueStore::_mount()
{
dout(5) << __func__ << "NCB:: path " << path << dendl;

_kv_only = false;
if (cct->_conf->bluestore_fsck_on_mount) {
dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
Expand Down Expand Up @@ -7681,12 +7688,15 @@ int BlueStore::umount()
#endif
dout(20) << __func__ << " stopping kv thread" << dendl;
_kv_stop();
_shutdown_cache();
// skip cache cleanup step on fast shutdown
if (likely(!m_fast_shutdown)) {
_shutdown_cache();
}
dout(20) << __func__ << " closing" << dendl;
}

_close_db_and_around();
if (cct->_conf->bluestore_fsck_on_umount) {
// disable fsck on fast-shutdown
if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
if (rc < 0)
return rc;
Expand Down Expand Up @@ -10305,6 +10315,11 @@ int BlueStore::get_numa_node(
return 0;
}

void BlueStore::prepare_for_fast_shutdown()
{
m_fast_shutdown = true;
}

int BlueStore::get_devices(set<string> *ls)
{
if (bdev) {
Expand Down Expand Up @@ -10432,7 +10447,8 @@ int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
string key_prefix;
_key_encode_u64(pool_id, &key_prefix);
*out_per_pool_omap = per_pool_omap != OMAP_BULK;
if (*out_per_pool_omap) {
// stop calls after db was closed
if (*out_per_pool_omap && db) {
auto prefix = per_pool_omap == OMAP_PER_POOL ?
PREFIX_PERPOOL_OMAP :
PREFIX_PERPG_OMAP;
Expand Down Expand Up @@ -19025,15 +19041,6 @@ int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t
return 0;
} else {
derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
std::cout << "===================================================================" << std::endl;
for (uint64_t i = 0; i < idx1; i++) {
std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl;
}

std::cout << "===================================================================" << std::endl;
for (uint64_t i = 0; i < idx2; i++) {
std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl;
}
return -1;
}
}
Expand Down Expand Up @@ -19081,9 +19088,9 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
utime_t start = ceph_clock_now();

auto shutdown_cache = make_scope_guard([&] {
std::cout << "Allocation Recovery was completed in " << duration
<< " seconds; insert_count=" << stats.insert_count
<< "; extent_count=" << stats.extent_count << std::endl;
dout(1) << "Allocation Recovery was completed in " << duration
<< " seconds; insert_count=" << stats.insert_count
<< "; extent_count=" << stats.extent_count << dendl;
_shutdown_cache();
_close_db_and_around();
});
Expand Down Expand Up @@ -19113,14 +19120,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
};
allocator->dump(count_entries);
ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
if (ret != 0) {
if (ret == 0) {
dout(5) << "Allocator drive - file integrity check OK" << dendl;
} else {
derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
}
}

std::cout << stats << std::endl;
dout(1) << stats << dendl;
return ret;
}

Expand Down
5 changes: 4 additions & 1 deletion src/os/bluestore/BlueStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -2764,7 +2764,7 @@ class BlueStore : public ObjectStore,

private:
int32_t ondisk_format = 0; ///< value detected on mount

// set by prepare_for_fast_shutdown(); when true, umount() skips _shutdown_cache() and fsck-on-umount
bool m_fast_shutdown = false;
int _upgrade_super(); ///< upgrade (called during open_super)
uint64_t _get_ondisk_reserved() const;
void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
Expand All @@ -2783,6 +2783,9 @@ class BlueStore : public ObjectStore,
bool wants_journal() override { return false; };
bool allows_journal() override { return false; };

// marks the store as being in fast-shutdown (sets m_fast_shutdown) ahead of umount()
void prepare_for_fast_shutdown() override;
virtual bool has_null_manager();

uint64_t get_min_alloc_size() const override {
return min_alloc_size;
}
Expand Down
101 changes: 94 additions & 7 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4258,27 +4258,44 @@ PerfCounters* OSD::create_recoverystate_perf()

int OSD::shutdown()
{
// vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
//cct->_conf->osd_fast_shutdown = true;

dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
<< cct->_conf->osd_fast_shutdown
<< ", null-fm = " << store->has_null_manager() << dendl;

utime_t start_time_func = ceph_clock_now();

if (cct->_conf->osd_fast_shutdown) {
derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
if (cct->_conf->osd_fast_shutdown_notify_mon)
service.prepare_to_stop();
cct->_log->flush();
_exit(0);
}

if (!service.prepare_to_stop())
// There is no state we need to keep when NOT running in NULL-FM mode
if (!store->has_null_manager()) {
cct->_log->flush();
_exit(0);
}
} else if (!service.prepare_to_stop()) {
return 0; // already shutting down
}

osd_lock.lock();
if (is_stopping()) {
osd_lock.unlock();
return 0;
}
dout(0) << "shutdown" << dendl;

if (!cct->_conf->osd_fast_shutdown) {
dout(0) << "shutdown" << dendl;
}

// don't accept new task for this OSD
set_state(STATE_STOPPING);

// Debugging
if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
// Disabled debugging during fast-shutdown
if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
cct->_conf.set_val("debug_osd", "100");
cct->_conf.set_val("debug_journal", "100");
cct->_conf.set_val("debug_filestore", "100");
Expand All @@ -4287,6 +4304,45 @@ int OSD::shutdown()
cct->_conf.apply_changes(nullptr);
}

if (cct->_conf->osd_fast_shutdown) {
// first, stop new task from being taken from op_shardedwq
// and clear all pending tasks
op_shardedwq.stop_for_fast_shutdown();

utime_t start_time_timer = ceph_clock_now();
tick_timer.shutdown();
{
std::lock_guard l(tick_timer_lock);
tick_timer_without_osd_lock.shutdown();
}

osd_lock.unlock();
utime_t start_time_osd_drain = ceph_clock_now();

// then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
osd_op_tp.drain();
osd_op_tp.stop();

utime_t start_time_umount = ceph_clock_now();
store->prepare_for_fast_shutdown();
std::lock_guard lock(osd_lock);
// TBD: assert in allocator that nothing is being add
store->umount();

utime_t end_time = ceph_clock_now();
if (cct->_conf->osd_fast_shutdown_timeout) {
ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
}
dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
cct->_log->flush();

// now it is safe to exit
_exit(0);
}

// stop MgrClient earlier as it's more like an internal consumer of OSD
mgrc.shutdown();

Expand Down Expand Up @@ -4448,6 +4504,9 @@ int OSD::shutdown()
hb_front_server_messenger->shutdown();
hb_back_server_messenger->shutdown();

utime_t duration = ceph_clock_now() - start_time_func;
dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;

tracing::osd::tracer.shutdown();

return r;
Expand Down Expand Up @@ -11072,6 +11131,11 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
}

void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
if (unlikely(m_fast_shutdown) ) {
// stop enqueuing when we are in the middle of a fast shutdown
return;
}

uint32_t shard_index =
item.get_ordering_token().hash_to_shard(osd->shards.size());

Expand Down Expand Up @@ -11102,6 +11166,11 @@ void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {

void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
if (unlikely(m_fast_shutdown) ) {
// stop enqueuing when we are in the middle of a fast shutdown
return;
}

auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
auto& sdata = osd->shards[shard_index];
ceph_assert(sdata);
Expand All @@ -11128,6 +11197,24 @@ void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
sdata->sdata_cond.notify_one();
}

// Put the work queue into fast-shutdown mode and throw away all pending work.
//
// m_fast_shutdown is raised first so that _enqueue()/_enqueue_front() return
// without queueing anything from now on. Each shard's scheduler is then
// emptied while holding that shard's lock; dequeued items are dropped without
// being processed.
void OSD::ShardedOpWQ::stop_for_fast_shutdown()
{
  m_fast_shutdown = true;

  for (uint32_t shard_index = 0; shard_index < osd->num_shards; ++shard_index) {
    auto& sdata = osd->shards[shard_index];
    ceph_assert(sdata);
    // RAII guard: the shard lock is released on every exit path
    std::lock_guard l{sdata->shard_lock};
    while (!sdata->scheduler->empty()) {
      // the item is intentionally discarded; it is destroyed at the end of this iteration
      [[maybe_unused]] auto dropped_item = sdata->scheduler->dequeue();
    }
  }
}

namespace ceph::osd_cmds {

int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
Expand Down
4 changes: 3 additions & 1 deletion src/osd/OSD.h
Original file line number Diff line number Diff line change
Expand Up @@ -1592,7 +1592,7 @@ class OSD : public Dispatcher,
: public ShardedThreadPool::ShardedWQ<OpSchedulerItem>
{
OSD *osd;

// raised by stop_for_fast_shutdown(); once set, _enqueue()/_enqueue_front() return without queueing
// NOTE(review): plain bool with no synchronization visible at its read/write sites — confirm whether it should be atomic
bool m_fast_shutdown = false;
public:
ShardedOpWQ(OSD *o,
ceph::timespan ti,
Expand All @@ -1610,6 +1610,8 @@ class OSD : public Dispatcher,
/// try to do some work
void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override;

/// enter fast-shutdown mode: stop accepting new items and drop everything still queued
void stop_for_fast_shutdown();

/// enqueue a new item
void _enqueue(OpSchedulerItem&& item) override;

Expand Down

0 comments on commit 9b2a64a

Please sign in to comment.