Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osd: support dumping long ops #13019

Merged
merged 2 commits into from
Apr 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/common/TrackedOp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void OpHistory::on_shutdown()
Mutex::Locker history_lock(ops_history_lock);
arrived.clear();
duration.clear();
slow_op.clear();
shutdown = true;
}

Expand All @@ -44,6 +45,8 @@ void OpHistory::insert(utime_t now, TrackedOpRef op)
return;
duration.insert(make_pair(op->get_duration(), op));
arrived.insert(make_pair(op->get_initiated(), op));
if (op->get_duration() >= history_slow_op_threshold)
slow_op.insert(make_pair(op->get_initiated(), op));
cleanup(now);
}

Expand All @@ -64,6 +67,12 @@ void OpHistory::cleanup(utime_t now)
duration.begin()->second));
duration.erase(duration.begin());
}

while (slow_op.size() > history_slow_op_size) {
slow_op.erase(make_pair(
slow_op.begin()->second->get_initiated(),
slow_op.begin()->second));
}
}

void OpHistory::dump_ops(utime_t now, Formatter *f)
Expand Down Expand Up @@ -165,6 +174,39 @@ bool OpTracker::dump_historic_ops(Formatter *f, bool by_duration)
return true;
}

void OpHistory::dump_slow_ops(utime_t now, Formatter *f)
{
Mutex::Locker history_lock(ops_history_lock);
cleanup(now);
f->open_object_section("OpHistory slow ops");
f->dump_int("num to keep", history_slow_op_size);
f->dump_int("threshold to keep", history_slow_op_threshold);
{
f->open_array_section("Ops");
for (set<pair<utime_t, TrackedOpRef> >::const_iterator i =
slow_op.begin();
i != slow_op.end();
++i) {
f->open_object_section("Op");
i->second->dump(now, f);
f->close_section();
}
f->close_section();
}
f->close_section();
}

bool OpTracker::dump_historic_slow_ops(Formatter *f)
{
RWLock::RLocker l(lock);
if (!tracking_enabled)
return false;

utime_t now = ceph_clock_now();
history.dump_slow_ops(now, f);
return true;
}

bool OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked)
{
RWLock::RLocker l(lock);
Expand Down
16 changes: 15 additions & 1 deletion src/common/TrackedOp.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,27 +33,37 @@ typedef boost::intrusive_ptr<TrackedOp> TrackedOpRef;
class OpHistory {
set<pair<utime_t, TrackedOpRef> > arrived;
set<pair<double, TrackedOpRef> > duration;
set<pair<utime_t, TrackedOpRef> > slow_op;
Mutex ops_history_lock;
void cleanup(utime_t now);
bool shutdown;
uint32_t history_size;
uint32_t history_duration;
uint32_t history_slow_op_size;
uint32_t history_slow_op_threshold;

public:
OpHistory() : ops_history_lock("OpHistory::Lock"), shutdown(false),
history_size(0), history_duration(0) {}
history_size(0), history_duration(0),
history_slow_op_size(0), history_slow_op_threshold(0) {}
~OpHistory() {
assert(arrived.empty());
assert(duration.empty());
assert(slow_op.empty());
}
void insert(utime_t now, TrackedOpRef op);
void dump_ops(utime_t now, Formatter *f);
void dump_ops_by_duration(utime_t now, Formatter *f);
void dump_slow_ops(utime_t now, Formatter *f);
void on_shutdown();
void set_size_and_duration(uint32_t new_size, uint32_t new_duration) {
history_size = new_size;
history_duration = new_duration;
}
void set_slow_op_size_and_threshold(uint32_t new_size, uint32_t new_threshold) {
history_slow_op_size = new_size;
history_slow_op_threshold = new_threshold;
}
};

struct ShardedTrackingData;
Expand All @@ -79,12 +89,16 @@ class OpTracker {
void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) {
history.set_size_and_duration(new_size, new_duration);
}
void set_history_slow_op_size_and_threshold(uint32_t new_size, uint32_t new_threshold) {
history.set_slow_op_size_and_threshold(new_size, new_threshold);
}
void set_tracking(bool enable) {
RWLock::WLocker l(lock);
tracking_enabled = enable;
}
bool dump_ops_in_flight(Formatter *f, bool print_only_blocked=false);
bool dump_historic_ops(Formatter *f, bool by_duration = false);
bool dump_historic_slow_ops(Formatter *f);
bool register_inflight_op(TrackedOp *i);
void unregister_inflight_op(TrackedOp *i);

Expand Down
2 changes: 2 additions & 0 deletions src/common/config_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,8 @@ OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking
OPTION(osd_num_op_tracker_shard, OPT_U32, 32) // The number of shards for holding the ops
OPTION(osd_op_history_size, OPT_U32, 20) // Max number of completed ops to track
OPTION(osd_op_history_duration, OPT_U32, 600) // Oldest completed op to track
OPTION(osd_op_history_slow_op_size, OPT_U32, 20) // Max number of slow ops to track
OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE, 10.0) // track the op if over this threshold
OPTION(osd_target_transaction_size, OPT_INT, 30) // to adjust various transactions that batch smaller items
OPTION(osd_failsafe_full_ratio, OPT_FLOAT, .97) // what % full makes an OSD "full" (failsafe)
OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL, true) // immediately mark OSDs as down once they refuse to accept connections
Expand Down
17 changes: 17 additions & 0 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1883,6 +1883,8 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
cct->_conf->osd_op_log_threshold);
op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
cct->_conf->osd_op_history_duration);
op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
cct->_conf->osd_op_history_slow_op_threshold);
}

OSD::~OSD()
Expand Down Expand Up @@ -1977,6 +1979,11 @@ bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
}
} else if (admin_command == "dump_historic_slow_ops") {
if (!op_tracker.dump_historic_slow_ops(f)) {
ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
}
} else if (admin_command == "dump_op_pq_state") {
f->open_object_section("pq");
op_shardedwq.dump(f);
Expand Down Expand Up @@ -2534,6 +2541,10 @@ void OSD::final_init()
"show the blocked ops currently in flight");
assert(r == 0);
r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
asok_hook,
"show recent ops");
assert(r == 0);
r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
asok_hook,
"show slowest recent ops");
assert(r == 0);
Expand Down Expand Up @@ -2973,6 +2984,7 @@ int OSD::shutdown()
cct->get_admin_socket()->unregister_command("dump_blocked_ops");
cct->get_admin_socket()->unregister_command("dump_historic_ops");
cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
cct->get_admin_socket()->unregister_command("dump_op_pq_state");
cct->get_admin_socket()->unregister_command("dump_blacklist");
cct->get_admin_socket()->unregister_command("dump_watchers");
Expand Down Expand Up @@ -8987,6 +8999,11 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
cct->_conf->osd_op_history_duration);
}
if (changed.count("osd_op_history_slow_op_size") ||
changed.count("osd_op_history_slow_op_threshold")) {
op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
cct->_conf->osd_op_history_slow_op_threshold);
}
if (changed.count("osd_enable_op_tracker")) {
op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
}
Expand Down