Skip to content

Commit

Permalink
Merge pull request #16134 from smithfarm/wip-19340-kraken
Browse files Browse the repository at this point in the history
kraken: An OSD was seen getting ENOSPC even with osd_failsafe_full_ratio passed

Reviewed-by: David Zafman <dzafman@redhat.com>
  • Loading branch information
smithfarm committed Jul 31, 2017
2 parents 938b723 + c6542ac commit b50909c
Show file tree
Hide file tree
Showing 11 changed files with 63 additions and 10 deletions.
2 changes: 1 addition & 1 deletion doc/rados/configuration/mon-osd-interaction.rst
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ Monitor Settings
mark Ceph OSD Daemons ``out``.

:Type: Double
:Default: ``.3``
:Default: ``.75``


``mon osd laggy halflife``
Expand Down
3 changes: 2 additions & 1 deletion src/common/config_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds '
OPTION(mon_osd_down_out_interval, OPT_INT, 600) // seconds
OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out
OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out
OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .75) // min osds required to be in to mark things out
OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get concerned (make it a power of 2)
OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap
Expand Down Expand Up @@ -305,6 +305,7 @@ OPTION(mon_crush_min_required_version, OPT_STR, "firefly")
OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0
OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
OPTION(mon_warn_osd_usage_min_max_delta, OPT_FLOAT, .40) // warn if difference between min and max OSD utilizations exceeds specified amount
OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
OPTION(mon_max_log_epochs, OPT_INT, 500)
Expand Down
26 changes: 26 additions & 0 deletions src/mon/PGMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1796,6 +1796,32 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
}
}

if (g_conf->mon_warn_osd_usage_min_max_delta) {
float max_osd_usage = 0.0, min_osd_usage = 1.0;
for (auto p = pg_map.osd_stat.begin(); p != pg_map.osd_stat.end(); ++p) {
// kb should never be 0, but avoid divide by zero in case of corruption
if (p->second.kb <= 0)
continue;
float usage = ((float)p->second.kb_used) / ((float)p->second.kb);
if (usage > max_osd_usage)
max_osd_usage = usage;
if (usage < min_osd_usage)
min_osd_usage = usage;
}
float diff = max_osd_usage - min_osd_usage;
if (diff > g_conf->mon_warn_osd_usage_min_max_delta) {
ostringstream ss;
ss << "difference between min (" << roundf(min_osd_usage*1000.0)/10.0
<< "%) and max (" << roundf(max_osd_usage*1000.0)/10.0
<< "%) osd usage " << roundf(diff*1000.0)/10.0 << "% > "
<< roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/10.0
<< "% (mon_warn_osd_usage_min_max_delta)";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (detail)
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}

// recovery
list<string> sl;
pg_map.overall_recovery_summary(NULL, &sl);
Expand Down
8 changes: 5 additions & 3 deletions src/os/bluestore/BlueStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4712,8 +4712,10 @@ int BlueStore::statfs(struct store_statfs_t *buf)
buf->available = alloc->get_free();

if (bluefs) {
// part of our shared device is "free" accordingly to BlueFS
buf->available += bluefs->get_free(bluefs_shared_bdev);
// part of our shared device is "free" according to BlueFS
// Don't include bluestore_bluefs_min because that space can't
// be used for any other purpose.
buf->available += bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;

// include dedicated db, too, if that isn't the shared device.
if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
Expand Down Expand Up @@ -7334,7 +7336,7 @@ void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
if (r == -ENOSPC)
// For now, if we hit _any_ ENOSPC, crash, before we do any damage
// by partially applying transactions.
msg = "ENOSPC handling not implemented";
msg = "ENOSPC from bluestore, misconfigured cluster";

if (r == -ENOTEMPTY) {
msg = "ENOTEMPTY suggests garbage data in osd data dir";
Expand Down
12 changes: 12 additions & 0 deletions src/os/filestore/FileJournal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2157,3 +2157,15 @@ void FileJournal::corrupt_header_magic(
(reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
corrupt(wfd, corrupt_at);
}

// Estimate how many bytes lie between the journal's recorded start offset
// (header.start) and the current write position (write_pos).
//
// Returns the estimate in bytes and logs it at debug level 20.
off64_t FileJournal::get_journal_size_estimate()
{
  const off64_t begin = header.start;
  // A write position below the start offset means the in-use region wraps
  // around max_size: sum the span from the start offset up to max_size
  // with write_pos. Otherwise the region is simply write_pos - start.
  const off64_t size = (write_pos < begin)
    ? (max_size - begin) + write_pos
    : write_pos - begin;
  dout(20) << __func__ << " journal size=" << size << dendl;
  return size;
}
2 changes: 2 additions & 0 deletions src/os/filestore/FileJournal.h
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,8 @@ class FileJournal :

void set_wait_on_full(bool b) { wait_on_full = b; }

// Returns an estimate, in bytes, of the span between the journal header's
// start offset and the current write position (defined in FileJournal.cc).
off64_t get_journal_size_estimate();

// reads

/// Result code for read_entry
Expand Down
10 changes: 9 additions & 1 deletion src/os/filestore/FileStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,14 @@ int FileStore::statfs(struct store_statfs_t *buf0)
}
buf0->total = buf.f_blocks * buf.f_bsize;
buf0->available = buf.f_bavail * buf.f_bsize;
// Adjust for writes pending in the journal
if (journal) {
uint64_t estimate = journal->get_journal_size_estimate();
if (buf0->available > estimate)
buf0->available -= estimate;
else
buf0->available = 0;
}
return 0;
}

Expand Down Expand Up @@ -2997,7 +3005,7 @@ void FileStore::_do_transaction(
} else if (r == -ENOSPC) {
// For now, if we hit _any_ ENOSPC, crash, before we do any damage
// by partially applying transactions.
msg = "ENOSPC handling not implemented";
msg = "ENOSPC from disk filesystem, misconfigured cluster";
} else if (r == -ENOTEMPTY) {
msg = "ENOTEMPTY suggests garbage data in osd data dir";
} else if (r == -EPERM) {
Expand Down
2 changes: 2 additions & 0 deletions src/os/filestore/Journal.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class Journal {

virtual int prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) = 0;

// Estimated size of the data currently held in the journal.
// This in-class default always reports 0.
// NOTE(review): presumably overridden by concrete journals such as
// FileJournal — confirm against the class hierarchy.
virtual off64_t get_journal_size_estimate()
{
  return 0;
}

// reads/recovery

};
Expand Down
2 changes: 1 addition & 1 deletion src/os/kstore/KStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2507,7 +2507,7 @@ void KStore::_txc_add_transaction(TransContext *txc, Transaction *t)
if (r == -ENOSPC)
// For now, if we hit _any_ ENOSPC, crash, before we do any damage
// by partially applying transactions.
msg = "ENOSPC handling not implemented";
msg = "ENOSPC from key value store, misconfigured cluster";

if (r == -ENOTEMPTY) {
msg = "ENOTEMPTY suggests garbage data in osd data dir";
Expand Down
2 changes: 1 addition & 1 deletion src/os/memstore/MemStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1037,7 +1037,7 @@ void MemStore::_do_transaction(Transaction& t)
if (r == -ENOSPC)
// For now, if we hit _any_ ENOSPC, crash, before we do any damage
// by partially applying transactions.
msg = "ENOSPC handling not implemented";
msg = "ENOSPC from MemStore, misconfigured cluster or insufficient memory";

if (r == -ENOTEMPTY) {
msg = "ENOTEMPTY suggests garbage data in osd data dir";
Expand Down
4 changes: 2 additions & 2 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -781,9 +781,9 @@ void OSDService::check_nearfull_warning(const osd_stat_t &osd_stat)
}
last_msg = now;
if (cur_state == FULL)
clog->error() << "OSD full dropping all updates " << (int)(ratio * 100) << "% full";
clog->error() << "OSD full dropping all updates " << (int)roundf(ratio * 100) << "% full";
else
clog->warn() << "OSD near full (" << (int)(ratio * 100) << "%)";
clog->warn() << "OSD near full (" << (int)roundf(ratio * 100) << "%)";
}

bool OSDService::check_failsafe_full()
Expand Down

0 comments on commit b50909c

Please sign in to comment.