From 1adc5fee2efc2b3c90189afffc6f6c918014cad9 Mon Sep 17 00:00:00 2001
From: Gancho Tenev
Date: Thu, 8 Sep 2016 13:23:09 -0700
Subject: [PATCH] TS-4834 Expose span failures

(1) AIO read/write error counters
    - proxy.process.cache.span.errors.read
    - proxy.process.cache.span.errors.write

(2) Span-related gauges. A span "moves" from "online" (errors == 0)
    to "failing" (errors > 0 && errors < proxy.config.cache.max_disk_errors)
    to "offline" (errors >= proxy.config.cache.max_disk_errors).
    "failing" + "offline" + "online" = total number of spans.
    - proxy.process.cache.span.failing
    - proxy.process.cache.span.offline
    - proxy.process.cache.span.online

(3) Added code to count span read/write failures that occur before the
    cache is marked ready ("cache enabled").
---
 .../monitoring/statistics/core/cache.en.rst | 28 ++++++++++++++++
 iocore/aio/AIO.cc                           |  9 +++---
 iocore/cache/Cache.cc                       | 23 +++++++++++--
 iocore/cache/CacheDisk.cc                   | 32 +++++++++++++++++++
 iocore/cache/I_Cache.h                      |  2 +-
 iocore/cache/P_CacheDisk.h                  |  1 +
 iocore/cache/P_CacheInternal.h              | 10 ++++++
 proxy/Main.cc                               |  2 +-
 8 files changed, 98 insertions(+), 9 deletions(-)

diff --git a/doc/admin-guide/monitoring/statistics/core/cache.en.rst b/doc/admin-guide/monitoring/statistics/core/cache.en.rst
index 9784ec4c43b..7126d0a0963 100644
--- a/doc/admin-guide/monitoring/statistics/core/cache.en.rst
+++ b/doc/admin-guide/monitoring/statistics/core/cache.en.rst
@@ -199,6 +199,34 @@ Cache
 .. ts:stat:: global proxy.process.cache.write.failure integer
 .. ts:stat:: global proxy.process.cache.write_per_sec float
 .. ts:stat:: global proxy.process.cache.write.success integer
+
+.. ts:stat:: global proxy.process.cache.span.errors.read integer
+
+   The number of span read errors (counter).
+
+.. ts:stat:: global proxy.process.cache.span.errors.write integer
+
+   The number of span write errors (counter).
+
+.. ts:stat:: global proxy.process.cache.span.failing integer
+
+   The number of failing spans (gauge): errors > 0 && errors < ``proxy.config.cache.max_disk_errors``.
+
+   ``proxy.process.cache.span.failing`` + ``proxy.process.cache.span.offline`` + ``proxy.process.cache.span.online`` = total number of spans.
+
+.. ts:stat:: global proxy.process.cache.span.offline integer
+
+   The number of offline spans (gauge): errors >= ``proxy.config.cache.max_disk_errors``.
+
+   ``proxy.process.cache.span.failing`` + ``proxy.process.cache.span.offline`` + ``proxy.process.cache.span.online`` = total number of spans.
+
+.. ts:stat:: global proxy.process.cache.span.online integer
+
+   The number of online spans (gauge): errors == 0.
+
+   ``proxy.process.cache.span.failing`` + ``proxy.process.cache.span.offline`` + ``proxy.process.cache.span.online`` = total number of spans.
+
 .. ts:stat:: global proxy.process.http.background_fill_bytes_aborted_stat integer
    :ungathered:
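Note: the relationship between the three gauges is easiest to see as a pure
classification over a span's error count. The sketch below is illustrative
only and not part of the patch; max_disk_errors stands in for
proxy.config.cache.max_disk_errors (5 by default), and the real accounting
lives in CacheDisk::incrErrors() and CacheProcessor::mark_storage_offline()
further down.

    #include <cstdio>

    // Illustrative model: which gauge a span is counted in is a pure
    // function of its error count.
    enum class SpanState { Online, Failing, Offline };

    static SpanState
    classify_span(int errors, int max_disk_errors)
    {
      if (errors == 0) {
        return SpanState::Online; // proxy.process.cache.span.online
      }
      if (errors < max_disk_errors) {
        return SpanState::Failing; // proxy.process.cache.span.failing
      }
      return SpanState::Offline; // proxy.process.cache.span.offline
    }

    int
    main()
    {
      // With proxy.config.cache.max_disk_errors at its default of 5:
      std::printf("%d %d %d\n",
                  (int)classify_span(0, 5),  // 0: Online
                  (int)classify_span(3, 5),  // 1: Failing
                  (int)classify_span(5, 5)); // 2: Offline
      return 0;
    }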
diff --git a/iocore/aio/AIO.cc b/iocore/aio/AIO.cc
index c4406b07203..396365c416e 100644
--- a/iocore/aio/AIO.cc
+++ b/iocore/aio/AIO.cc
@@ -489,10 +489,11 @@ aio_thread_main(void *arg)
       ink_mutex_release(&current_req->aio_mutex);
       if (cache_op((AIOCallbackInternal *)op) <= 0) {
         if (aio_err_callbck) {
-          AIOCallback *callback_op      = new AIOCallbackInternal();
-          callback_op->aiocb.aio_fildes = op->aiocb.aio_fildes;
-          callback_op->mutex            = aio_err_callbck->mutex;
-          callback_op->action           = aio_err_callbck;
+          AIOCallback *callback_op          = new AIOCallbackInternal();
+          callback_op->aiocb.aio_fildes     = op->aiocb.aio_fildes;
+          callback_op->aiocb.aio_lio_opcode = op->aiocb.aio_lio_opcode;
+          callback_op->mutex                = aio_err_callbck->mutex;
+          callback_op->action               = aio_err_callbck;
           eventProcessor.schedule_imm(callback_op);
         }
       }
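Note: copying aio_lio_opcode into the error callback is what lets the
consumer tell read failures from write failures. A minimal sketch of that
dispatch, illustrative only (the real consumer is CacheDisk::incrErrors()
in CacheDisk.cc below):

    #include <aio.h>

    // Map a failed AIO operation to the counter it should bump.
    // LIO_READ and LIO_WRITE come from <aio.h>.
    static const char *
    span_error_stat_name(int aio_lio_opcode)
    {
      switch (aio_lio_opcode) {
      case LIO_READ:
        return "proxy.process.cache.span.errors.read";
      case LIO_WRITE:
        return "proxy.process.cache.span.errors.write";
      default:
        return nullptr; // e.g. LIO_NOP: no span error counter applies
      }
    }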
diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc
index e4c5676d2d2..43ae9cb2029 100644
--- a/iocore/cache/Cache.cc
+++ b/iocore/cache/Cache.cc
@@ -801,6 +801,11 @@ CacheProcessor::diskInitialized()
     gndisks = gndisks - bad_disks;
   }

+  /* All the bad disks were just taken offline, so update the span gauges. */
+  RecSetGlobalRawStatSum(cache_rsb, cache_span_offline_stat, bad_disks);
+  RecIncrGlobalRawStat(cache_rsb, cache_span_failing_stat, -bad_disks);
+  RecSetGlobalRawStatSum(cache_rsb, cache_span_online_stat, gndisks);
+
   /* create the cachevol list only if num volumes are greater than 0. */
   if (config_volumes.num_volumes == 0) {
@@ -1429,6 +1434,7 @@ Vol::handle_dir_clear(int event, void *data)
   op = (AIOCallback *)data;
   if ((size_t)op->aio_result != (size_t)op->aiocb.aio_nbytes) {
     Warning("unable to clear cache directory '%s'", hash_text.get());
+    disk->incrErrors(op);
     fd = -1;
   }
@@ -1548,6 +1554,7 @@ Vol::handle_recover_from_data(int event, void * /* data ATS_UNUSED */)
   } else if (event == AIO_EVENT_DONE) {
     if ((size_t)io.aiocb.aio_nbytes != (size_t)io.aio_result) {
       Warning("disk read error on recover '%s', clearing", hash_text.get());
+      disk->incrErrors(&io);
       goto Lclear;
     }
     if (io.aiocb.aio_offset == header->last_write_pos) {
@@ -1991,8 +1998,8 @@ Cache::vol_initialized(bool result)
 /** Set the state of a disk programmatically.
  */
 bool
-CacheProcessor::mark_storage_offline(CacheDisk *d ///< Target disk
-                                     )
+CacheProcessor::mark_storage_offline(CacheDisk *d, ///< Target disk
+                                     bool admin)
 {
   bool zret; // indicates whether there's any online storage left.
   int p;
@@ -2015,6 +2022,11 @@ CacheProcessor::mark_storage_offline(CacheDisk *d ///< Target disk
   RecIncrGlobalRawStat(cache_rsb, cache_direntries_total_stat, -total_dir_delete);
   RecIncrGlobalRawStat(cache_rsb, cache_direntries_used_stat, -used_dir_delete);

+  /* Update the span gauges: a failing span moves from the "failing" to the
+   * "offline" bucket; a span an operator took offline moves from "online" to "offline". */
+  RecIncrGlobalRawStat(cache_rsb, admin ? cache_span_online_stat : cache_span_failing_stat, -1);
+  RecIncrGlobalRawStat(cache_rsb, cache_span_offline_stat, 1);
+
   if (theCache) {
     rebuild_host_table(theCache);
   }
@@ -2072,7 +2084,7 @@ AIO_Callback_handler::handle_disk_failure(int /* event ATS_UNUSED */, void *data
     if (d->fd == cb->aiocb.aio_fildes) {
       char message[256];
-      d->num_errors++;
+      d->incrErrors(cb);

       if (!DISK_BAD(d)) {
         snprintf(message, sizeof(message), "Error accessing Disk %s [%d/%d]", d->path, d->num_errors, cache_config_max_disk_errors);
@@ -3175,6 +3187,11 @@ register_cache_stats(RecRawStatBlock *rsb, const char *prefix)
   REG_INT("sync.count", cache_directory_sync_count_stat);
   REG_INT("sync.bytes", cache_directory_sync_bytes_stat);
   REG_INT("sync.time", cache_directory_sync_time_stat);
+  REG_INT("span.errors.read", cache_span_errors_read_stat);
+  REG_INT("span.errors.write", cache_span_errors_write_stat);
+  REG_INT("span.failing", cache_span_failing_stat);
+  REG_INT("span.offline", cache_span_offline_stat);
+  REG_INT("span.online", cache_span_online_stat);
 }

diff --git a/iocore/cache/CacheDisk.cc b/iocore/cache/CacheDisk.cc
index 5987ae58c0d..dbea54cc91a 100644
--- a/iocore/cache/CacheDisk.cc
+++ b/iocore/cache/CacheDisk.cc
@@ -23,6 +23,35 @@

 #include "P_Cache.h"

+void
+CacheDisk::incrErrors(const AIOCallback *io)
+{
+  if (0 == this->num_errors) {
+    /* This is the first read/write error on this span since ATS started.
+     * Move the newly failing span from the "online" to the "failing" bucket. */
+    RecIncrGlobalRawStat(cache_rsb, (int)(cache_span_online_stat), -1);
+    RecIncrGlobalRawStat(cache_rsb, (int)(cache_span_failing_stat), 1);
+  }
+  this->num_errors++;
+
+  const char *opname = "unknown";
+  int opcode         = io->aiocb.aio_lio_opcode;
+  int fd             = io->aiocb.aio_fildes;
+  switch (io->aiocb.aio_lio_opcode) {
+  case LIO_READ:
+    opname = "READ";
+    RecIncrGlobalRawStat(cache_rsb, (int)(cache_span_errors_read_stat), 1);
+    break;
+  case LIO_WRITE:
+    opname = "WRITE";
+    RecIncrGlobalRawStat(cache_rsb, (int)(cache_span_errors_write_stat), 1);
+    break;
+  default:
+    break;
+  }
+  Warning("failed operation: %s (opcode=%d), span: %s (fd=%d)", opname, opcode, path, fd);
+}
+
 int
 CacheDisk::open(char *s, off_t blocks, off_t askip, int ahw_sector_size, int fildes, bool clear)
 {
@@ -118,6 +147,7 @@ CacheDisk::clearDone(int event, void * /* data ATS_UNUSED */)

   if ((size_t)io.aiocb.aio_nbytes != (size_t)io.aio_result) {
     Warning("Could not clear disk header for disk %s: declaring disk bad", path);
+    incrErrors(&io);
     SET_DISK_BAD(this);
   }
   // update_header();
@@ -133,6 +163,7 @@ CacheDisk::openStart(int event, void * /* data ATS_UNUSED */)

   if ((size_t)io.aiocb.aio_nbytes != (size_t)io.aio_result) {
     Warning("could not read disk header for disk %s: declaring disk bad", path);
+    incrErrors(&io);
     SET_DISK_BAD(this);
     SET_HANDLER(&CacheDisk::openDone);
     return openDone(EVENT_IMMEDIATE, 0);
@@ -202,6 +233,7 @@ CacheDisk::syncDone(int event, void * /* data ATS_UNUSED */)

   if ((size_t)io.aiocb.aio_nbytes != (size_t)io.aio_result) {
     Warning("Error writing disk header for disk %s:disk bad", path);
+    incrErrors(&io);
     SET_DISK_BAD(this);
     return EVENT_DONE;
   }
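Note: taken together, incrErrors() and mark_storage_offline() implement two
bucket transfers over the three gauges. The sketch below models the
bookkeeping with hypothetical plain-int gauges standing in for the
RecIncrGlobalRawStat() calls; it is illustrative only.

    #include <cassert>

    struct SpanGauges {
      int online  = 0; // spans with errors == 0
      int failing = 0; // spans with 0 < errors < max_disk_errors
      int offline = 0; // spans declared bad, or disabled by an operator

      // First-ever error on a span: online -> failing
      // (the num_errors == 0 branch in CacheDisk::incrErrors()).
      void on_first_error() { --online; ++failing; }

      // Span taken out of service (CacheProcessor::mark_storage_offline()).
      void on_offline(bool admin)
      {
        if (admin) {
          --online; // an operator took a healthy span offline
        } else {
          --failing; // errors reached max_disk_errors
        }
        ++offline;
      }

      int total() const { return online + failing + offline; }
    };

    int
    main()
    {
      SpanGauges g;
      g.online = 10;           // e.g. 10 spans configured, all healthy
      g.on_first_error();      // one span starts failing
      g.on_offline(false);     // its errors reach max_disk_errors
      g.on_offline(true);      // an operator disables a healthy span
      assert(g.total() == 10); // invariant: the sum equals the span count
      return 0;
    }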
diff --git a/iocore/cache/I_Cache.h b/iocore/cache/I_Cache.h
index 6ba165c086f..cf19c8db456 100644
--- a/iocore/cache/I_Cache.h
+++ b/iocore/cache/I_Cache.h
@@ -115,7 +115,7 @@ struct CacheProcessor : public Processor {

       @note This is what is called if a disk is disabled due to I/O errors.
    */
-  bool mark_storage_offline(CacheDisk *d);
+  bool mark_storage_offline(CacheDisk *d, bool admin = false);

   /** Find the storage for a @a path.
       If @a len is 0 then @a path is presumed null terminated.
diff --git a/iocore/cache/P_CacheDisk.h b/iocore/cache/P_CacheDisk.h
index b3916252f90..7399b6e6b1d 100644
--- a/iocore/cache/P_CacheDisk.h
+++ b/iocore/cache/P_CacheDisk.h
@@ -138,6 +138,7 @@ struct CacheDisk : public Continuation {
   int delete_all_volumes();
   void update_header();
   DiskVol *get_diskvol(int vol_number);
+  void incrErrors(const AIOCallback *io);
 };

 #endif
diff --git a/iocore/cache/P_CacheInternal.h b/iocore/cache/P_CacheInternal.h
index 69ab98ac411..15b8684a293 100644
--- a/iocore/cache/P_CacheInternal.h
+++ b/iocore/cache/P_CacheInternal.h
@@ -148,6 +148,16 @@ enum {
   cache_directory_sync_count_stat,
   cache_directory_sync_time_stat,
   cache_directory_sync_bytes_stat,
+  /* AIO read/write error counters */
+  cache_span_errors_read_stat,
+  cache_span_errors_write_stat,
+  /* Span gauges. A span "moves" from "online" (errors == 0)
+   * to "failing" (errors > 0 && errors < proxy.config.cache.max_disk_errors)
+   * to "offline" (errors >= proxy.config.cache.max_disk_errors).
+   * "failing" + "offline" + "online" = total number of spans. */
+  cache_span_offline_stat,
+  cache_span_online_stat,
+  cache_span_failing_stat,
   cache_stat_count
 };
diff --git a/proxy/Main.cc b/proxy/Main.cc
index 078558390eb..5815ad82822 100644
--- a/proxy/Main.cc
+++ b/proxy/Main.cc
@@ -1978,7 +1978,7 @@ mgmt_storage_device_cmd_callback(void *data, char *arg, int len)
   switch (cmd) {
   case MGMT_EVENT_STORAGE_DEVICE_CMD_OFFLINE:
     Debug("server", "Marking %.*s offline", len, arg);
-    cacheProcessor.mark_storage_offline(d);
+    cacheProcessor.mark_storage_offline(d, /* admin */ true);
     break;
   }
 }
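Note: with the default argument in place, the admin flag selects which
bucket the span leaves. A hypothetical illustration of the two call sites,
assuming the patched CacheProcessor and CacheDisk declarations from the ATS
headers (a fragment, not a standalone program):

    #include "P_Cache.h" // ATS-internal header; assumes the patched tree

    void
    offline_examples(CacheProcessor &cp, CacheDisk *d)
    {
      // I/O-error path (disk declared bad after repeated AIO failures):
      // the default admin = false moves the span from "failing" to "offline".
      cp.mark_storage_offline(d);

      // Management path (mgmt_storage_device_cmd_callback() above): an
      // operator took a healthy span offline, so admin = true moves it
      // from "online" to "offline".
      cp.mark_storage_offline(d, /* admin */ true);
    }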