osd: Warn about objects with too many omap entries
Signed-off-by: Brad Hubbard <bhubbard@redhat.com>
badone committed Oct 24, 2017
1 parent 63ef97b commit 71bf047
Showing 11 changed files with 293 additions and 21 deletions.
@@ -0,0 +1,22 @@
roles:
- [mon.a, mgr.x, osd.0, osd.1, client.0]
overrides:
  ceph:
    log-whitelist:
      - \(OSDMAP_FLAGS\)
      - \(OSD_FULL\)
      - \(MDS_READ_ONLY\)
      - large omap objects
      - Large omap object found
      - application not enabled
    conf:
      osd:
        osd deep scrub large omap object value sum threshold: 8800000
        osd deep scrub large omap object key threshold: 20000
tasks:
- install:
- ceph:
- workunit:
    clients:
      all:
        - rados/test_large_omap_detection.py
130 changes: 130 additions & 0 deletions qa/workunits/rados/test_large_omap_detection.py
@@ -0,0 +1,130 @@
#!/usr/bin/python
# -*- mode:python -*-
# vim: ts=4 sw=4 smarttab expandtab
#
# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#

import json
import rados
import shlex
import subprocess
import time

def cleanup(cluster):
    cluster.delete_pool('large-omap-test-pool')
    cluster.shutdown()

def init():
    # For local testing
    #cluster = rados.Rados(conffile='./ceph.conf')
    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    print("\nCluster ID: " + cluster.get_fsid())
    cluster.create_pool('large-omap-test-pool')
    ioctx = cluster.open_ioctx('large-omap-test-pool')
    ioctx.write_full('large-omap-test-object1', "Lorem ipsum")
    op = ioctx.create_write_op()

    keys = []
    values = []
    # One key more than the configured key threshold of 20000.
    for x in range(20001):
        keys.append(str(x))
        values.append("X")

    ioctx.set_omap(op, tuple(keys), tuple(values))
    ioctx.operate_write_op(op, 'large-omap-test-object1', 0)
    ioctx.release_write_op(op)

    ioctx.write_full('large-omap-test-object2', "Lorem ipsum dolor")
    op = ioctx.create_write_op()

    buffer = ("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do "
              "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut "
              "enim ad minim veniam, quis nostrud exercitation ullamco laboris "
              "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in "
              "reprehenderit in voluptate velit esse cillum dolore eu fugiat "
              "nulla pariatur. Excepteur sint occaecat cupidatat non proident, "
              "sunt in culpa qui officia deserunt mollit anim id est laborum.")

    keys = []
    values = []
    # Enough value bytes in total to exceed the configured value sum threshold.
    for x in range(20000):
        keys.append(str(x))
        values.append(buffer)

    ioctx.set_omap(op, tuple(keys), tuple(values))
    ioctx.operate_write_op(op, 'large-omap-test-object2', 0)
    ioctx.release_write_op(op)
    ioctx.close()
    return cluster

def get_deep_scrub_timestamp(pgid):
    cmd = ['ceph', 'pg', 'dump', '--format=json-pretty']
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    out = proc.communicate()[0]
    for stat in json.loads(out)['pg_stats']:
        if stat['pgid'] == pgid:
            return stat['last_deep_scrub_stamp']

def wait_for_scrub():
    osds = set()
    pgs = dict()
    for obj in ('large-omap-test-object1', 'large-omap-test-object2'):
        cmd = ['ceph', 'osd', 'map', 'large-omap-test-pool', obj,
               '--format=json-pretty']
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        mapping = json.loads(proc.communicate()[0])
        osds.add(mapping['acting_primary'])
        pgs[mapping['pgid']] = get_deep_scrub_timestamp(mapping['pgid'])

    for osd in osds:
        command = "ceph osd deep-scrub osd." + str(osd)
        subprocess.check_call(shlex.split(command))

    for pg in pgs:
        retries = 0
        # Wait up to 10 minutes for the deep scrub timestamp to advance.
        while retries < 60 and pgs[pg] == get_deep_scrub_timestamp(pg):
            time.sleep(10)
            retries += 1

def check_health_output():
    retries = 0
    result = 0
    while retries < 6 and result != 2:
        result = 0
        retries += 1
        output = subprocess.check_output(["ceph", "health", "detail"])
        for line in output.splitlines():
            result += int(line.find('2 large omap objects') != -1)
        time.sleep(10)

    if result != 2:
        print("Error, got invalid output:")
        print(output)
        raise Exception("expected '2 large omap objects' in ceph health detail")

def main():
    cluster = init()
    wait_for_scrub()
    check_health_output()

    cleanup(cluster)

if __name__ == '__main__':
    main()
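
A quick sanity check on why these two fixtures trip the thresholds configured in the QA YAML above: the concatenated lorem ipsum paragraph is 445 bytes, so each object exceeds exactly one of the two limits, and the health summary should end up reporting "2 large omap objects".

# Back-of-the-envelope check of the test fixtures against the QA thresholds.
key_threshold = 20000            # osd deep scrub large omap object key threshold
value_sum_threshold = 8800000    # osd deep scrub large omap object value sum threshold

object1_keys = 20001             # one key over the key threshold; values are tiny
object2_value_sum = 20000 * 445  # 20000 keys (not over), one 445-byte value each

assert object1_keys > key_threshold              # object1: key count check fires
assert object2_value_sum > value_sum_threshold   # object2: 8,900,000 > 8,800,000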
12 changes: 12 additions & 0 deletions src/common/options.cc
@@ -2523,6 +2523,18 @@ std::vector<Option> get_global_options() {
    .set_default(2_hr)
    .set_description(""),

    Option("osd_deep_scrub_large_omap_object_key_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
    .set_default(2000000)
    .set_description("threshold for number of keys to determine a large omap object")
    .add_service("osd")
    .add_see_also("osd_deep_scrub_large_omap_object_value_sum_threshold"),

    Option("osd_deep_scrub_large_omap_object_value_sum_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
    .set_default(1_G)
    .set_description("threshold for summed size (bytes) of all key values to determine a large omap object")
    .add_service("osd")
    .add_see_also("osd_deep_scrub_large_omap_object_key_threshold"),

    Option("osd_class_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
    .set_default(CEPH_LIBDIR "/rados-classes")
    .set_description(""),
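
To confirm what a running OSD actually uses for these options (for example after an override like the one in the QA YAML above), one approach is to query the daemon's admin socket. A hedged sketch; it assumes the command runs on the host holding the OSD's admin socket:

import json
import subprocess

def get_osd_option(osd_id, option):
    # 'ceph daemon osd.N config get <option>' returns JSON such as
    # {"<option>": "<value>"} when run next to the OSD's admin socket.
    out = subprocess.check_output(
        ['ceph', 'daemon', 'osd.%d' % osd_id, 'config', 'get', option])
    return json.loads(out)[option]

print(get_osd_option(0, 'osd_deep_scrub_large_omap_object_key_threshold'))
print(get_osd_option(0, 'osd_deep_scrub_large_omap_object_value_sum_threshold'))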
33 changes: 33 additions & 0 deletions src/mon/PGMap.cc
@@ -2341,6 +2341,39 @@ void PGMap::get_health_checks(
    checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
  }

  // LARGE_OMAP_OBJECTS
  if (pg_sum.stats.sum.num_large_omap_objects) {
    list<string> detail;
    for (auto &pool : pools) {
      const string& pool_name = osdmap.get_pool_name(pool.first);
      auto it2 = pg_pool_sum.find(pool.first);
      if (it2 == pg_pool_sum.end()) {
        continue;
      }
      const pool_stat_t *pstat = &it2->second;
      if (pstat == nullptr) {
        continue;
      }
      const object_stat_sum_t& sum = pstat->stats.sum;
      if (sum.num_large_omap_objects) {
        stringstream ss;
        ss << sum.num_large_omap_objects << " large objects found in pool "
           << "'" << pool_name << "'";
        detail.push_back(ss.str());
      }
    }
    if (!detail.empty()) {
      ostringstream ss;
      ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
      auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
      stringstream tip;
      tip << "Search the cluster log for 'Large omap object found' for more "
          << "details.";
      detail.push_back(tip.str());
      d.detail.swap(detail);
    }
  }

  // CACHE_POOL_NEAR_FULL
  {
    list<string> detail;
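Scripts can consume the new check without scraping plain-text health output. A sketch of parsing JSON-formatted health; the field layout ('checks', 'summary', 'detail' with 'message' entries) is assumed from luminous-era output and worth verifying against your release:

import json
import subprocess

def large_omap_health():
    # Returns (summary, detail_messages) for LARGE_OMAP_OBJECTS, or None.
    out = subprocess.check_output(['ceph', 'health', 'detail', '--format=json'])
    check = json.loads(out).get('checks', {}).get('LARGE_OMAP_OBJECTS')
    if check is None:
        return None
    summary = check['summary']['message']  # e.g. "2 large omap objects"
    details = [d['message'] for d in check.get('detail', [])]
    return summary, details
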
33 changes: 19 additions & 14 deletions src/osd/PG.cc
@@ -4644,38 +4644,42 @@ void PG::scrub_compare_maps()
   map<pg_shard_t, ScrubMap *> maps;
   maps[pg_whoami] = &scrubber.primary_scrubmap;
 
-  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-       i != actingbackfill.end();
-       ++i) {
-    if (*i == pg_whoami) continue;
-    dout(2) << __func__ << " replica " << *i << " has "
-            << scrubber.received_maps[*i].objects.size()
+  for (const auto& i : actingbackfill) {
+    if (i == pg_whoami) continue;
+    dout(2) << __func__ << " replica " << i << " has "
+            << scrubber.received_maps[i].objects.size()
             << " items" << dendl;
-    maps[*i] = &scrubber.received_maps[*i];
+    maps[i] = &scrubber.received_maps[i];
   }
 
-  map<hobject_t,ScrubMap::object>::const_iterator i;
-  map<pg_shard_t, ScrubMap *>::const_iterator j;
   set<hobject_t> master_set;
 
   // Construct master set
-  for (j = maps.begin(); j != maps.end(); ++j) {
-    for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
-      master_set.insert(i->first);
+  for (const auto& map : maps) {
+    for (const auto& i : map.second->objects) {
+      master_set.insert(i.first);
     }
   }
 
+  stringstream ss;
+  get_pgbackend()->be_large_omap_check(maps, master_set,
+                                       scrubber.large_omap_objects, ss);
+  if (!ss.str().empty()) {
+    osd->clog->warn(ss);
+  }
+
   if (acting.size() > 1) {
     dout(10) << __func__ << " comparing replica scrub maps" << dendl;
 
-    stringstream ss;
-
     // Map from object with errors to good peer
     map<hobject_t, list<pg_shard_t>> authoritative;
 
     dout(2) << __func__ << " osd." << acting[0] << " has "
             << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
 
+    ss.str("");
+    ss.clear();
+
     get_pgbackend()->be_compare_scrubmaps(
       maps,
       master_set,
@@ -4872,6 +4876,7 @@ void PG::scrub_finish()
     info.history.last_clean_scrub_stamp = now;
     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
     info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
+    info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
   } else {
     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
     // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
2 changes: 2 additions & 0 deletions src/osd/PG.h
@@ -1461,6 +1461,7 @@ class PG : public DoutPrefixProvider {
    set<pg_shard_t> waiting_on_whom;
    int shallow_errors;
    int deep_errors;
    int large_omap_objects = 0;
    int fixed;
    ScrubMap primary_scrubmap;
    map<pg_shard_t, ScrubMap> received_maps;
@@ -1576,6 +1577,7 @@
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      large_omap_objects = 0;
      fixed = 0;
      deep = false;
      seed = 0;
20 changes: 20 additions & 0 deletions src/osd/PGBackend.cc
@@ -1098,3 +1098,23 @@ void PGBackend::be_compare_scrubmaps(
    }
  }
}

void PGBackend::be_large_omap_check(const map<pg_shard_t,ScrubMap*> &maps,
                                    const set<hobject_t> &master_set,
                                    int& large_omap_objects,
                                    ostream &warnstream) const
{
  // Iterate through objects and check large omap object flag
  for (const auto& k : master_set) {
    for (const auto& map : maps) {
      ScrubMap::object& obj = map.second->objects[k];
      if (obj.large_omap_object_found) {
        large_omap_objects++;
        warnstream << "Large omap object found. Object: " << k << " Key count: "
                   << obj.large_omap_object_key_count << " Size (bytes): "
                   << obj.large_omap_object_value_size << '\n';
        break;
      }
    }
  }
}
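
Note the break: an object flagged in more than one shard's scrub map is still counted once. A minimal Python sketch of the same deduplication, with the data shapes invented for illustration:

def count_large_omap_objects(scrub_maps):
    # scrub_maps: {shard_id: {object_name: {'large_omap_object_found': bool}}}
    master_set = set()
    for objects in scrub_maps.values():
        master_set.update(objects)

    count = 0
    for name in master_set:
        for objects in scrub_maps.values():
            entry = objects.get(name)
            if entry and entry.get('large_omap_object_found'):
                count += 1
                break  # count each object at most once across shards
    return count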
5 changes: 5 additions & 0 deletions src/osd/PGBackend.h
@@ -591,6 +591,11 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
    uint32_t seed,
    ScrubMap::object &o,
    ThreadPool::TPHandle &handle) = 0;
  void be_large_omap_check(
    const map<pg_shard_t,ScrubMap*> &maps,
    const set<hobject_t> &master_set,
    int& large_omap_objects,
    ostream &warnstream) const;

  static PGBackend *build_pg_backend(
    const pg_pool_t &pool,
17 changes: 17 additions & 0 deletions src/osd/ReplicatedBackend.cc
@@ -769,20 +769,37 @@ void ReplicatedBackend::be_deep_scrub(
    ghobject_t(
      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
  assert(iter);
  uint64_t keys_scanned = 0;
  uint64_t value_sum = 0;
  for (iter->seek_to_first(); iter->status() == 0 && iter->valid();
       iter->next(false)) {
    ++keys_scanned;
    handle.reset_tp_timeout();

    dout(25) << "CRC key " << iter->key() << " value:\n";
    iter->value().hexdump(*_dout);
    *_dout << dendl;

    value_sum += iter->value().length();

    ::encode(iter->key(), bl);
    ::encode(iter->value(), bl);
    oh << bl;
    bl.clear();
  }

  if (keys_scanned > cct->_conf->get_val<uint64_t>(
        "osd_deep_scrub_large_omap_object_key_threshold") ||
      value_sum > cct->_conf->get_val<uint64_t>(
        "osd_deep_scrub_large_omap_object_value_sum_threshold")) {
    dout(25) << __func__ << " " << poid
             << " large omap object detected. Object has " << keys_scanned
             << " keys and size " << value_sum << " bytes" << dendl;
    o.large_omap_object_found = true;
    o.large_omap_object_key_count = keys_scanned;
    o.large_omap_object_value_size = value_sum;
  }

  if (iter->status() < 0) {
    dout(25) << __func__ << " " << poid
             << " on omap scan, db status error" << dendl;
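The key count and value sum the deep scrub computes here can be approximated from the client side with the librados Python bindings, which is handy for hunting down an offending object once the warning fires. A hedged sketch; the pool, object name, and thresholds are placeholders, and it assumes the get_omap_vals/operate_read_op API also used by the test above:

import rados

KEY_THRESHOLD = 2000000        # default osd_deep_scrub_large_omap_object_key_threshold
VALUE_SUM_THRESHOLD = 1 << 30  # default value sum threshold (1 GiB)

def omap_stats(ioctx, obj, batch=1000):
    # Paginate through the object's omap, counting keys and summing value sizes.
    keys_scanned = 0
    value_sum = 0
    start_after = ""
    while True:
        with rados.ReadOpCtx() as read_op:
            it, ret = ioctx.get_omap_vals(read_op, start_after, "", batch)
            ioctx.operate_read_op(read_op, obj)
            got = 0
            for key, val in it:
                keys_scanned += 1
                value_sum += len(val)
                start_after = key
                got += 1
        if got < batch:
            break
    return keys_scanned, value_sum

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
ioctx = cluster.open_ioctx('large-omap-test-pool')  # placeholder pool
keys, size = omap_stats(ioctx, 'large-omap-test-object1')
if keys > KEY_THRESHOLD or size > VALUE_SUM_THRESHOLD:
    print("large omap object: %d keys, %d value bytes" % (keys, size))
ioctx.close()
cluster.shutdown()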
