Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

librbd: Zipkin tracing [GSOC 2016] #10637

Closed
wants to merge 33 commits into from
Closed
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
5b5d443
Build support for BlkKin (LTTng + Zipkin) tracing
agshew Mar 11, 2015
df3fdab
cmake: add module and WITH_BLKIN option for blkin
cbodley Aug 17, 2015
9568720
blkin: add header and initialization
cbodley Aug 20, 2015
589dafe
msg: pass Connection to decode_message
cbodley Oct 27, 2015
1fd89fa
blkin: Messenger integration
cbodley Aug 17, 2015
fb282d2
blkin: add traces to SimpleMessenger
cbodley Aug 25, 2015
67287b8
blkin: add traces to XioMessenger
cbodley Aug 25, 2015
1bc46f2
blkin: add traces to AsyncMessenger
cbodley Aug 25, 2015
b78200f
blkin: osd op messages carry trace information
cbodley Aug 25, 2015
b1143c7
blkin: set up tracing in the OSD
cbodley Aug 28, 2015
a87324b
blkin: set up tracing in PGs
cbodley Aug 28, 2015
1bcb22a
blkin: add traces to ReplicatedBackend
cbodley Aug 28, 2015
b6392ab
blkin: add traces to ECBackend
cbodley Aug 27, 2015
6ff1880
blkin: add traces to FileStore/Journal
cbodley Aug 28, 2015
8c612fc
blkin: add config variable osd_blkin_trace_all
cbodley Oct 27, 2015
c1094ab
blkin: add traces to librados and Objecter
cbodley Oct 23, 2015
dda790f
blkin: add config variable osdc_blkin_trace_all
cbodley Nov 12, 2015
a1bc03c
cmake: Add blkin libs to target_link_libraries
vh4x Jun 25, 2016
17509bd
Pass trace information for aio_writes within librbd.
vh4x Jul 15, 2016
bfc1367
Incorporate librados C changes from commit f9aea0cd33f071409c5653e0ac…
vh4x Jul 18, 2016
e62b40e
Modify aio_operate to take trace information.
vh4x Jul 19, 2016
08b08be
Pass trace information from rbd_aio_read_traced to aio_operate_read()
vh4x Jul 23, 2016
90ecc6a
librbd: Rebase template changes in librbd.
vear91 Aug 5, 2016
9de5eed
librbd: Pass trace information in AioObjectRequests
vear91 Aug 8, 2016
b7d46ac
Add the blkin tracing library as a submodule.
vear91 Aug 10, 2016
3eefd8c
Add tests for rbd_aio_write_traced and rbd_aio_write_traced in test_l…
vear91 Aug 11, 2016
d36519d
Set CEPH_FEATURE_BLKIN_TRACING to unused bit 59.
vear91 Aug 11, 2016
1da3bcf
Overload aio_operate instead of using an optional parameter for trace…
vear91 Aug 15, 2016
efa54c4
osd: encode trace as part of the partial chunk of MOSDOp
cbodley Aug 17, 2016
2451abf
librados: Record events for zipkin root span initialization
vear91 Aug 18, 2016
c2f6bdb
librados: Remove the blkin_trace_info parameters in LIbradosTestStub.cc
vear91 Aug 18, 2016
9b75fe4
rbd: Call the aio_operate overload when there is trace information.
vear91 Aug 18, 2016
1a87bbf
rbd: Avoid storing blkin_trace_info*
vear91 Aug 18, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Expand Up @@ -33,3 +33,6 @@
[submodule "src/isa-l"]
path = src/isa-l
url = https://github.com/ceph/isa-l
[submodule "src/blkin"]
path = src/blkin
url = https://github.com/linuxbox2/blkin
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we move this under the ceph github org? I can create the repo there

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

6 changes: 6 additions & 0 deletions CMakeLists.txt
Expand Up @@ -324,6 +324,12 @@ if(WITH_XIO)
set(HAVE_XIO ${XIO_FOUND})
endif(WITH_XIO)

# Optional blkin (LTTng + Zipkin) tracing support; off unless the user
# configures with -DWITH_BLKIN=ON.
option(WITH_BLKIN "Use blkin to emit LTTng tracepoints for Zipkin" OFF)
if(WITH_BLKIN)
# REQUIRED: configuration fails here when blkin cannot be located.
find_package(blkin REQUIRED)
# Directory-scoped include path, so it is not limited to a single target.
# BLKIN_INCLUDE_DIR is expected to come from the blkin find module.
include_directories(${BLKIN_INCLUDE_DIR})
endif(WITH_BLKIN)

#option for RGW
option(WITH_RADOSGW "Rados Gateway is enabled" ON)

Expand Down
23 changes: 23 additions & 0 deletions cmake/modules/Findblkin.cmake
@@ -0,0 +1,23 @@
# - Try to find blkin
# Once done this will define
#  BLKIN_FOUND - System has blkin
#  BLKIN_INCLUDE_DIR - The blkin include directories
#  BLKIN_LIBRARIES - The libraries needed to use blkin
#                    (only set when blkin was found)

# pkg-config is optional: its output is used purely as search hints below.
find_package(PkgConfig)
pkg_check_modules(PC_BLKIN QUIET libblkin)

# ztracer.hpp may live directly in an include dir or in a blkin/ subdirectory.
find_path(BLKIN_INCLUDE_DIR ztracer.hpp
  HINTS ${PC_BLKIN_INCLUDEDIR} ${PC_BLKIN_INCLUDE_DIRS}
  PATH_SUFFIXES blkin)
find_library(BLKIN_LIBRARY NAMES blkin
  HINTS ${PC_BLKIN_LIBDIR} ${PC_BLKIN_LIBRARY_DIRS})

include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set BLKIN_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(blkin DEFAULT_MSG
  BLKIN_LIBRARY BLKIN_INCLUDE_DIR)

# Publish the link line only on success, so a failed optional lookup never
# leaks "BLKIN_LIBRARY-NOTFOUND" into a consumer's target_link_libraries().
if(BLKIN_FOUND)
  set(BLKIN_LIBRARIES ${BLKIN_LIBRARY} lttng-ust)
endif()

# Hide the cache entries created by find_path()/find_library().
# BLKIN_LIBRARIES is a normal variable, not a cache entry, so it must not be
# listed here.
mark_as_advanced(BLKIN_INCLUDE_DIR BLKIN_LIBRARY)
11 changes: 11 additions & 0 deletions configure.ac
Expand Up @@ -528,6 +528,17 @@ if test "x$enable_coverage" != xno; then
fi
AC_SUBST(GCOV_PREFIX_STRIP, `echo $(pwd)/src | tr -dc / | wc -c`)

# blkin (lttng+zipkin) tracing?
# Off by default.  Passing --with-blkin runs the pkg-config lookup for the
# blkin module; no not-found action is supplied so a missing module stops
# configure with the default pkg-config error.
AC_ARG_WITH([blkin],
	[AS_HELP_STRING([--with-blkin], [blkin (lttng + zipkin) tracing])],
	[],
	[with_blkin=no])
have_blkin=no
# Use the POSIX = operator: == is a bash extension that other /bin/sh
# implementations reject.
AS_IF([test "x$with_blkin" = "xyes"],
	[PKG_CHECK_MODULES([BLKIN], [blkin], [have_blkin=yes])])
AM_CONDITIONAL(WITH_BLKIN, test "x$have_blkin" = xyes)
AM_COND_IF([WITH_BLKIN], [AC_DEFINE([WITH_BLKIN], [1], [Defined if using BlkKin])])

# is radosgw available?
RADOSGW=0
AS_IF([test "x$with_radosgw" != xno],
Expand Down
4 changes: 3 additions & 1 deletion do_autogen.sh
Expand Up @@ -4,6 +4,7 @@ usage() {
cat <<EOF
do_autogen.sh: make a ceph build by running autogen, etc.

-b blkin tracing
-C <parameter> add parameters to configure
-c use cryptopp
-d <level> debug build
Expand Down Expand Up @@ -36,9 +37,10 @@ verbose=0
profile=0
rocksdb=1
CONFIGURE_FLAGS="--disable-static --with-lttng"
while getopts "C:cd:e:hjJLO:pPRTv" flag
while getopts "bC:cd:e:hjJLO:pPRTv" flag
do
case $flag in
b) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-blkin";;
C) CONFIGURE_FLAGS="$CONFIGURE_FLAGS $OPTARG";;
c) CONFIGURE_FLAGS="$CONFIGURE_FLAGS --with-cryptopp --without-nss";;
d) debug_level=$OPTARG;;
Expand Down
167 changes: 167 additions & 0 deletions doc/dev/blkin.rst
@@ -0,0 +1,167 @@
=========================
Tracing Ceph With BlkKin
=========================

Ceph can use Blkin, a library created by Marios Kogias and others,
which enables tracking a specific request from the time it enters
the system at higher levels till it is finally served by RADOS.

In general, Blkin implements the Dapper_ tracing semantics
in order to show the causal relationships between the different
processing phases that an IO request may trigger. The goal is an
end-to-end visualisation of the request's route in the system,
accompanied by information concerning latencies in each processing
phase. Thanks to LTTng this can happen with a minimal overhead and
in realtime. The LTTng traces can then be visualized with Twitter's
Zipkin_.

.. _Dapper: http://static.googleusercontent.com/media/research.google.com/el//pubs/archive/36356.pdf
.. _Zipkin: http://twitter.github.io/zipkin/


Installing Blkin
================

You can install Marios Kogias' upstream Blkin_ by hand.::

cd blkin/
make && make install

or build distribution packages using DistroReadyBlkin_, which also comes with
pkgconfig support. If you choose the latter, then you must generate the
configure and make files first.::

cd blkin
autoreconf -i

.. _Blkin: https://github.com/marioskogias/blkin
.. _DistroReadyBlkin: https://github.com/agshew/blkin


Configuring Ceph with Blkin
===========================

If you built and installed Blkin by hand, rather than building and
installing packages, then set these variables before configuring
Ceph.::

export BLKIN_CFLAGS=-Iblkin/
export BLKIN_LIBS=-lzipkin-cpp

Since there are separate lttng and blkin changes to Ceph, you may
want to configure with something like::

./configure --with-blkin --without-lttng --with-debug


Testing Blkin
=============

It's easy to test Ceph's Blkin tracing. Let's assume you don't have
Ceph already running, and you compiled Ceph with Blkin support but
you didn't install it. Then launch Ceph with the ``vstart.sh`` script
in Ceph's src directory so you can see the possible tracepoints.::

cd src
OSD=3 MON=3 RGW=1 ./vstart.sh -n
lttng list --userspace

You'll see something like the following::

UST events:
-------------
PID: 8987 - Name: ./ceph-osd
zipkin:timestamp (loglevel: TRACE_WARNING (4)) (type: tracepoint)
zipkin:keyval (loglevel: TRACE_WARNING (4)) (type: tracepoint)
ust_baddr_statedump:soinfo (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)

PID: 8407 - Name: ./ceph-mon
zipkin:timestamp (loglevel: TRACE_WARNING (4)) (type: tracepoint)
zipkin:keyval (loglevel: TRACE_WARNING (4)) (type: tracepoint)
ust_baddr_statedump:soinfo (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)

...

Next, stop Ceph so that the tracepoints can be enabled.::

./stop.sh

Start up an LTTng session and enable the tracepoints.::

lttng create blkin-test
lttng enable-event --userspace zipkin:timestamp
lttng enable-event --userspace zipkin:keyval
lttng start

Then start up Ceph again.::

OSD=3 MON=3 RGW=1 ./vstart.sh -n

You may want to check that ceph is up.::

./ceph status

Now put something in using rados, check that it made it, get it back, and remove it.::

./rados mkpool test-blkin
./rados put test-object-1 ./vstart.sh --pool=test-blkin
./rados -p test-blkin ls
./ceph osd map test-blkin test-object-1
./rados get test-object-1 ./vstart-copy.sh --pool=test-blkin
md5sum vstart*
./rados rm test-object-1 --pool=test-blkin

You could also use the example in ``examples/librados/`` or ``rados bench``.

Then stop the LTTng session and see what was collected.::

lttng stop
lttng view

You'll see something like::

[13:09:07.755054973] (+?.?????????) scruffy zipkin:timestamp: { cpu_id = 5 }, { trace_name = "Main", service_name = "MOSDOp", port_no = 0, ip = "0.0.0.0", trace_id = 7492589359882233221, span_id = 2694140257089376129, parent_span_id = 0, event = "Message allocated" }
[13:09:07.755071569] (+0.000016596) scruffy zipkin:keyval: { cpu_id = 5 }, { trace_name = "Main", service_name = "MOSDOp", port_no = 0, ip = "0.0.0.0", trace_id = 7492589359882233221, span_id = 2694140257089376129, parent_span_id = 0, key = "Type", val = "MOSDOp" }
[13:09:07.755074217] (+0.000002648) scruffy zipkin:keyval: { cpu_id = 5 }, { trace_name = "Main", service_name = "MOSDOp", port_no = 0, ip = "0.0.0.0", trace_id = 7492589359882233221, span_id = 2694140257089376129, parent_span_id = 0, key = "Reqid", val = "client.4126.0:1" }
...


Install Zipkin
===============
One of the points of using Blkin is so that you can look at the traces
using Zipkin. Users should run Zipkin as a tracepoints collector and
also a web service, which means users need to run three services,
zipkin-collector, zipkin-query and zipkin-web.

Download Zipkin Package::

wget https://github.com/twitter/zipkin/archive/1.1.0.tar.gz
tar zxf 1.1.0.tar.gz
cd zipkin-1.1.0
bin/collector cassandra &
bin/query cassandra &
bin/web &

Check Zipkin::

bin/test
Browse http://${zipkin-web-ip}:8080


Show Ceph's Blkin Traces in Zipkin-web
======================================
Blkin provides a script which translates lttng result to Zipkin
(Dapper) semantics.

Send lttng data to Zipkin::

python3 babeltrace_zipkin.py ${lttng-traces-dir}/${blkin-test}/ust/uid/0/64-bit/ -p ${zipkin-collector-port(9410 by default)} -s ${zipkin-collector-ip}

Example::

python3 babeltrace_zipkin.py ~/lttng-traces-dir/blkin-test-20150225-160222/ust/uid/0/64-bit/ -p 9410 -s 127.0.0.1

Check Ceph traces on webpage::

Browse http://${zipkin-web-ip}:8080
Click "Find traces"
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Expand Up @@ -523,6 +523,7 @@ endif(HAVE_ARMV8_CRC)

add_library(common_utf8 STATIC common/utf8.c)

# Link dependencies of the "common" library. ${BLKIN_LIBRARIES} is the blkin
# tracing link line; presumably empty when WITH_BLKIN is off because the blkin
# find module is then never run -- TODO confirm.
target_link_libraries(common json_spirit common_utf8 erasure_code rt uuid resolv ${CRYPTO_LIBS} ${Boost_LIBRARIES} ${BLKID_LIBRARIES} ${EXECINFO_LIBRARIES} ${BLKIN_LIBRARIES})
if(${WITH_LTTNG})
add_subdirectory(tracing)
endif(${WITH_LTTNG})
Expand Down
6 changes: 6 additions & 0 deletions src/Makefile-env.am
Expand Up @@ -257,6 +257,12 @@ if ENABLE_COVERAGE
EXTRALIBS += -lgcov
endif # ENABLE_COVERAGE

# blkin tracing: when the WITH_BLKIN conditional is true, define WITH_BLKIN
# for the preprocessor and add the blkin compile and link flags.
# BLKIN_CFLAGS and BLKIN_LIBS are presumably supplied by configure's
# pkg-config check or exported by hand -- verify against configure.ac.
if WITH_BLKIN
AM_CPPFLAGS += -DWITH_BLKIN
AM_CXXFLAGS += $(BLKIN_CFLAGS)
EXTRALIBS += $(BLKIN_LIBS)
endif

# Libosd always needs osdc and os
LIBOSD += $(LIBOSDC) $(LIBOS)

Expand Down
6 changes: 6 additions & 0 deletions src/Makefile-server.am
Expand Up @@ -21,6 +21,9 @@ if WITH_MON

ceph_mon_SOURCES = ceph_mon.cc
ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL) $(LIBCOMMON) $(LIBAUTH) $(LIBCOMMON) $(LIBMON_TYPES)
if WITH_BLKIN
ceph_mon_LDADD += $(BLKIN_LIBS)
endif
bin_PROGRAMS += ceph-mon

endif # WITH_MON
Expand All @@ -42,6 +45,9 @@ ceph_osd_LDADD = \
if WITH_LTTNG
ceph_osd_LDADD += $(LIBOSD_TP)
endif
if WITH_BLKIN
ceph_osd_LDADD += $(BLKIN_LIBS)
endif
bin_PROGRAMS += ceph-osd

endif # WITH_OSD
Expand Down
1 change: 1 addition & 0 deletions src/blkin
Submodule blkin added at ac55e4
5 changes: 3 additions & 2 deletions src/client/ObjecterWriteback.h
Expand Up @@ -17,7 +17,8 @@ class ObjecterWriteback : public WritebackHandler {
virtual void read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc, uint64_t off, uint64_t len,
snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
__u32 trunc_seq, int op_flags, Context *onfinish) {
__u32 trunc_seq, int op_flags, Context *onfinish,
const blkin_trace_info *trace_info = nullptr) {
m_objecter->read_trunc(oid, oloc, off, len, snapid, pbl, 0,
trunc_size, trunc_seq,
new C_OnFinisher(new C_Lock(m_lock, onfinish),
Expand All @@ -34,7 +35,7 @@ class ObjecterWriteback : public WritebackHandler {
const SnapContext& snapc, const bufferlist &bl,
ceph::real_time mtime, uint64_t trunc_size,
__u32 trunc_seq, ceph_tid_t journal_tid,
Context *oncommit) {
Context *oncommit, const blkin_trace_info *trace_info = nullptr) {
return m_objecter->write_trunc(oid, oloc, off, len, snapc, bl, mtime, 0,
trunc_size, trunc_seq, NULL,
new C_OnFinisher(new C_Lock(m_lock,
Expand Down
6 changes: 6 additions & 0 deletions src/common/TrackedOp.h
Expand Up @@ -18,6 +18,7 @@
#include <include/utime.h>
#include "common/Mutex.h"
#include "common/histogram.h"
#include "common/zipkin_trace.h"
#include "include/xlist.h"
#include "msg/Message.h"
#include "include/memory.h"
Expand Down Expand Up @@ -182,6 +183,11 @@ class TrackedOp {
virtual void _unregistered() {};

public:
ZTracer::Trace osd_trace;
ZTracer::Trace pg_trace;
ZTracer::Trace store_trace;
ZTracer::Trace journal_trace;

virtual ~TrackedOp() {}

const utime_t& get_initiated() const {
Expand Down
2 changes: 2 additions & 0 deletions src/common/common_init.cc
Expand Up @@ -24,6 +24,7 @@
#include "common/safe_io.h"
#include "common/valgrind.h"
#include "common/version.h"
#include "common/zipkin_trace.h"
#include "include/color.h"

#include <errno.h>
Expand Down Expand Up @@ -123,6 +124,7 @@ void complain_about_parse_errors(CephContext *cct,
void common_init_finish(CephContext *cct)
{
cct->init_crypto();
ZTracer::ztrace_init();

int flags = cct->get_init_flags();
if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
Expand Down
3 changes: 3 additions & 0 deletions src/common/config_opts.h
Expand Up @@ -905,6 +905,9 @@ OPTION(osd_bench_large_size_max_throughput, OPT_U64, 100 << 20) // 100 MB/s
OPTION(osd_bench_max_block_size, OPT_U64, 64 << 20) // cap the block size at 64MB
OPTION(osd_bench_duration, OPT_U32, 30) // duration of 'osd bench', capped at 30s to avoid triggering timeouts

OPTION(osd_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all osd requests
OPTION(osdc_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all objecter requests

OPTION(osd_discard_disconnected_ops, OPT_BOOL, true)

OPTION(memstore_device_bytes, OPT_U64, 1024*1024*1024)
Expand Down