Skip to content

Commit

Permalink
os/bluestore: limit OSD memory usage by tuning the cache size.
Browse files Browse the repository at this point in the history
Signed-off-by: Mark Nelson <mnelson@redhat.com>
(cherry picked from commit 1b8a87b)

Conflicts:
	src/common/options.cc
	src/os/bluestore/BlueStore.cc
trivial - no g_conf operator. or option TYPE_SIZE
  • Loading branch information
Mark Nelson authored and jdurgin committed Sep 12, 2018
1 parent edb5441 commit 802ea6c
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 58 deletions.
48 changes: 38 additions & 10 deletions src/common/options.cc
Expand Up @@ -3146,6 +3146,32 @@ std::vector<Option> get_global_options() {
.set_default(true)
.set_description(""),


Option("osd_memory_target", Option::TYPE_UINT, Option::LEVEL_BASIC)
.set_default(4_G)
.add_see_also("bluestore_cache_autotune")
.set_description("When tcmalloc and cache autotuning is enabled, try to keep this many bytes mapped in memory."),

Option("osd_memory_base", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(768_M)
.add_see_also("bluestore_cache_autotune")
.set_description("When tcmalloc and cache autotuning is enabled, estimate the minimum amount of memory in bytes the OSD will need."),

Option("osd_memory_expected_fragmentation", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(0.15)
.add_see_also("bluestore_cache_autotune")
.set_description("When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation."),

Option("osd_memory_cache_min", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(128_M)
.add_see_also("bluestore_cache_autotune")
.set_description("When tcmalloc and cache autotuning is enabled, set the minimum amount of memory used for caches."),

Option("osd_memory_cache_resize_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(1)
.add_see_also("bluestore_cache_autotune")
.set_description("When tcmalloc and cache autotuning is enabled, wait this many seconds between resizing caches."),

Option("memstore_device_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1_G)
.set_description(""),
Expand Down Expand Up @@ -3543,39 +3569,41 @@ std::vector<Option> get_global_options() {
.set_default(.5)
.set_description("2Q paper suggests .5"),

Option("bluestore_cache_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("bluestore_cache_size", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(0)
.set_description("Cache size (in bytes) for BlueStore")
.set_long_description("This includes data and metadata cached by BlueStore as well as memory devoted to rocksdb's cache(s)."),

Option("bluestore_cache_size_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("bluestore_cache_size_hdd", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(1_G)
.set_description("Default bluestore_cache_size for rotational media"),

Option("bluestore_cache_size_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("bluestore_cache_size_ssd", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(3_G)
.set_description("Default bluestore_cache_size for non-rotational (solid state) media"),

Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.01)
Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.4)
.add_see_also("bluestore_cache_size")
.set_description("Ratio of bluestore cache to devote to metadata"),

Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.99)
Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.4)
.add_see_also("bluestore_cache_size")
.set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),

Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(true)
.add_see_also("bluestore_cache_size")
.add_see_also("bluestore_cache_meta_ratio")
.set_description("Automatically tune the ratio of caches while respecting min values."),

Option("bluestore_cache_autotune_chunk_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
Option("bluestore_cache_autotune_chunk_size", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(33554432)
.add_see_also("bluestore_cache_autotune")
.set_description("The chunk size in bytes to allocate to caches when cache autotune is enabled."),

Option("bluestore_cache_autotune_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
Option("bluestore_cache_autotune_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(5)
.add_see_also("bluestore_cache_autotune")
.set_description("The number of seconds to wait between rebalances when cache autotune is enabled."),
Expand Down
5 changes: 4 additions & 1 deletion src/os/CMakeLists.txt
Expand Up @@ -69,7 +69,10 @@ if(WITH_SPDK)
add_compile_options(-mcrc32 -msse3 -mssse3 -msse4.1 -msse4.2)
endif()

add_library(os STATIC ${libos_srcs} $<TARGET_OBJECTS:kv_objs>)
add_library(os STATIC ${libos_srcs}
$<TARGET_OBJECTS:kv_objs> $<TARGET_OBJECTS:heap_profiler_objs>)



if(HAVE_LIBAIO)
target_link_libraries(os ${AIO_LIBRARIES})
Expand Down
166 changes: 125 additions & 41 deletions src/os/bluestore/BlueStore.cc
Expand Up @@ -33,6 +33,7 @@
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
Expand Down Expand Up @@ -3293,26 +3294,45 @@ void *BlueStore::MempoolThread::entry()
caches.push_back(store->db);
caches.push_back(&meta_cache);
caches.push_back(&data_cache);
autotune_cache_size = store->osd_memory_cache_min;

utime_t next_balance = ceph_clock_now();
utime_t next_resize = ceph_clock_now();

bool interval_stats_trim = false;
bool interval_stats_resize = false;
while (!stop) {
_adjust_cache_settings();

// Before we trim, check and see if it's time to rebalance
bool log_stats = false;
// Before we trim, check and see if it's time to rebalance/resize.
double autotune_interval = store->cache_autotune_interval;
double resize_interval = store->osd_memory_cache_resize_interval;

if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
// Log events at 5 instead of 20 when balance happens.
interval_stats_resize = true;
interval_stats_trim = true;
if (store->cache_autotune) {
_balance_cache(caches);
}

next_balance = ceph_clock_now();
next_balance += autotune_interval;
log_stats = true;
}
if (resize_interval > 0 && next_resize < ceph_clock_now()) {
if (ceph_using_tcmalloc() && store->cache_autotune) {
_tune_cache_size(interval_stats_resize);
interval_stats_resize = false;
}
next_resize = ceph_clock_now();
next_resize += resize_interval;
}

_trim_shards(log_stats);
store->_update_cache_logger();
// Now Trim
_trim_shards(interval_stats_trim);
interval_stats_trim = false;

store->_update_cache_logger();
utime_t wait;
wait += store->cct->_conf->bluestore_cache_trim_interval;
cond.WaitInterval(lock, wait);
Expand All @@ -3328,59 +3348,116 @@ void BlueStore::MempoolThread::_adjust_cache_settings()
data_cache.set_cache_ratio(store->cache_data_ratio);
}

void BlueStore::MempoolThread::_trim_shards(bool log_stats)
void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
{
uint64_t cache_size = store->cache_size;
auto cct = store->cct;
size_t num_shards = store->cache_shards.size();
int64_t kv_alloc_bytes = 0;
int64_t meta_alloc_bytes = 0;
int64_t data_alloc_bytes = 0;

int64_t kv_used = store->db->get_cache_usage();
int64_t meta_used = meta_cache._get_used_bytes();
int64_t data_used = data_cache._get_used_bytes();

uint64_t cache_size = store->cache_size;
int64_t kv_alloc =
static_cast<int64_t>(store->db->get_cache_ratio() * cache_size);
int64_t meta_alloc =
static_cast<int64_t>(meta_cache.get_cache_ratio() * cache_size);
int64_t data_alloc =
static_cast<int64_t>(data_cache.get_cache_ratio() * cache_size);

if (store->cache_autotune) {
kv_alloc_bytes = store->db->get_cache_bytes();
meta_alloc_bytes = meta_cache.get_cache_bytes();
data_alloc_bytes = data_cache.get_cache_bytes();
cache_size = autotune_cache_size;

kv_alloc = store->db->get_cache_bytes();
meta_alloc = meta_cache.get_cache_bytes();
data_alloc = data_cache.get_cache_bytes();
}

if (interval_stats) {
ldout(cct, 5) << __func__ << " cache_size: " << cache_size
<< " kv_alloc: " << kv_alloc
<< " kv_used: " << kv_used
<< " meta_alloc: " << meta_alloc
<< " meta_used: " << meta_used
<< " data_alloc: " << data_alloc
<< " data_used: " << data_used << dendl;
} else {
kv_alloc_bytes = static_cast<int64_t>(
store->db->get_cache_ratio() * cache_size);
meta_alloc_bytes = static_cast<int64_t>(
meta_cache.get_cache_ratio() * cache_size);
data_alloc_bytes = static_cast<int64_t>(
data_cache.get_cache_ratio() * cache_size);
}
if (log_stats) {
double kv_alloc_ratio = (double) kv_alloc_bytes / cache_size;
double meta_alloc_ratio = (double) meta_alloc_bytes / cache_size;
double data_alloc_ratio = (double) data_alloc_bytes / cache_size;
double kv_used_ratio = (double) store->db->get_cache_usage() / cache_size;
double meta_used_ratio = (double) meta_cache._get_used_bytes() / cache_size;
double data_used_ratio = (double) data_cache._get_used_bytes() / cache_size;

ldout(store->cct, 5) << __func__ << " ratios -" << std::fixed << std::setprecision(1)
<< " kv_alloc: " << 100*kv_alloc_ratio << "%"
<< " kv_used: " << 100*kv_used_ratio << "%"
<< " meta_alloc: " << 100*meta_alloc_ratio << "%"
<< " meta_used: " << 100*meta_used_ratio << "%"
<< " data_alloc: " << 100*data_alloc_ratio << "%"
<< " data_used: " << 100*data_used_ratio << "%" << dendl;
ldout(cct, 20) << __func__ << " cache_size: " << cache_size
<< " kv_alloc: " << kv_alloc
<< " kv_used: " << kv_used
<< " meta_alloc: " << meta_alloc
<< " meta_used: " << meta_used
<< " data_alloc: " << data_alloc
<< " data_used: " << data_used << dendl;
}

uint64_t max_shard_onodes = static_cast<uint64_t>(
(meta_alloc_bytes / (double) num_shards) / meta_cache.get_bytes_per_onode());
uint64_t max_shard_buffer = static_cast<uint64_t>(
data_alloc_bytes / num_shards);
ldout(store->cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
<< " max_shard_buffer: " << max_shard_buffer << dendl;
(meta_alloc / (double) num_shards) / meta_cache.get_bytes_per_onode());
uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);

ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
<< " max_shard_buffer: " << max_shard_buffer << dendl;

for (auto i : store->cache_shards) {
i->trim(max_shard_onodes, max_shard_buffer);
}
}

void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
{
  // Resize the aggregate cache budget (autotune_cache_size) so that the
  // process's mapped heap converges on osd_memory_target.  Only called when
  // tcmalloc is in use and cache autotuning is enabled.
  //
  // interval_stats: when true, log the decision at level 5 instead of 20.
  auto cct = store->cct;
  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t cache_min = store->osd_memory_cache_min;

  // Budget available to caches: the memory target, discounted by the
  // expected fragmentation overhead, minus the OSD's base (non-cache)
  // footprint.  Compute in signed space: with a small osd_memory_target
  // (e.g. 512M vs the 768M default base) the unsigned expression
  // ((1.0 - fragmentation) * target) - base wraps around to a huge value
  // and the autotuner would never shrink the caches.  Clamp the result so
  // cache_max never falls below cache_min.
  int64_t free_mem = static_cast<int64_t>((1.0 - fragmentation) * target) -
                     static_cast<int64_t>(base);
  uint64_t cache_max = (free_mem > static_cast<int64_t>(cache_min)) ?
      static_cast<uint64_t>(free_mem) : cache_min;

  size_t heap_size = 0;
  size_t unmapped = 0;
  uint64_t mapped = 0;

  // Return freed-but-retained pages to the OS first so that "mapped"
  // reflects memory tcmalloc is actually holding on our behalf.
  ceph_heap_release_free_memory();
  ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
  ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
  mapped = heap_size - unmapped;

  // Start from the current budget, bounded to [cache_min, cache_max].
  uint64_t new_size = autotune_cache_size;
  new_size = (new_size < cache_max) ? new_size : cache_max;
  new_size = (new_size > cache_min) ? new_size : cache_min;

  // Approach the min/max slowly, but bounce away quickly: the further
  // mapped memory is from the target, the larger the step toward the
  // relevant bound.
  if ((uint64_t) mapped < target) {
    double ratio = 1 - ((double) mapped / target);
    new_size += ratio * (cache_max - new_size);
  } else {
    double ratio = 1 - ((double) target / mapped);
    new_size -= ratio * (new_size - cache_min);
  }

  if (interval_stats) {
    ldout(cct, 5) << __func__
                  << " target: " << target
                  << " heap: " << heap_size
                  << " unmapped: " << unmapped
                  << " mapped: " << mapped
                  << " old cache_size: " << autotune_cache_size
                  << " new cache size: " << new_size << dendl;
  } else {
    ldout(cct, 20) << __func__
                   << " target: " << target
                   << " heap: " << heap_size
                   << " unmapped: " << unmapped
                   << " mapped: " << mapped
                   << " old cache_size: " << autotune_cache_size
                   << " new cache size: " << new_size << dendl;
  }
  autotune_cache_size = new_size;
}

void BlueStore::MempoolThread::_balance_cache(
const std::list<PriorityCache::PriCache *>& caches)
{
int64_t mem_avail = store->cache_size;
int64_t mem_avail = autotune_cache_size;

// Assign memory for each priority level
for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
Expand Down Expand Up @@ -3860,6 +3937,13 @@ int BlueStore::_set_cache_sizes()
cct->_conf->get_val<uint64_t>("bluestore_cache_autotune_chunk_size");
cache_autotune_interval =
cct->_conf->get_val<double>("bluestore_cache_autotune_interval");
osd_memory_target = cct->_conf->get_val<uint64_t>("osd_memory_target");
osd_memory_base = cct->_conf->get_val<uint64_t>("osd_memory_base");
osd_memory_expected_fragmentation =
cct->_conf->get_val<double>("osd_memory_expected_fragmentation");
osd_memory_cache_min = cct->_conf->get_val<uint64_t>("osd_memory_cache_min");
osd_memory_cache_resize_interval =
cct->_conf->get_val<double>("osd_memory_cache_resize_interval");

if (cct->_conf->bluestore_cache_size) {
cache_size = cct->_conf->bluestore_cache_size;
Expand Down
15 changes: 9 additions & 6 deletions src/os/bluestore/BlueStore.h
Expand Up @@ -1921,17 +1921,18 @@ class BlueStore : public ObjectStore,
uint64_t kv_throttle_costs = 0;

// cache trim control
uint64_t cache_size = 0; ///< total cache size
uint64_t cache_size = 0; ///< total cache size
double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
double cache_data_ratio = 0; ///< cache ratio dedicated to object data
uint64_t cache_meta_min = 0; ///< cache min dedicated to metadata
uint64_t cache_kv_min = 0; ///< cache min dedicated to kv (e.g., rocksdb)
uint64_t cache_data_min = 0; ///< cache min dedicated to object data
bool cache_autotune = false; ///< cache autotune setting
uint64_t cache_autotune_chunk_size = 0; ///< cache autotune chunk size
double cache_autotune_interval = 0; ///< time to wait between cache rebalancing

uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
uint64_t osd_memory_cache_min = 0;      ///< Min memory to assign when autotuning cache
double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
std::mutex vstatfs_lock;
volatile_statfs vstatfs;

Expand All @@ -1942,6 +1943,7 @@ class BlueStore : public ObjectStore,
Cond cond;
Mutex lock;
bool stop = false;
uint64_t autotune_cache_size = 0;

struct MempoolCache : public PriorityCache::PriCache {
BlueStore *store;
Expand Down Expand Up @@ -2060,7 +2062,8 @@ class BlueStore : public ObjectStore,

private:
void _adjust_cache_settings();
void _trim_shards(bool log_stats);
void _trim_shards(bool interval_stats);
void _tune_cache_size(bool interval_stats);
void _balance_cache(const std::list<PriorityCache::PriCache *>& caches);
void _balance_cache_pri(int64_t *mem_avail,
const std::list<PriorityCache::PriCache *>& caches,
Expand Down

0 comments on commit 802ea6c

Please sign in to comment.