Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/bin/bash | |
| # | |
| # iolatency - summarize block device I/O latency as a histogram. | |
| # Written using Linux ftrace. | |
| # | |
| # This shows the distribution of latency, allowing modes and latency outliers | |
| # to be identified and studied. | |
| # | |
| # USAGE: ./iolatency [-hQT] [-d device] [-i iotype] [interval [count]] | |
| # | |
| # REQUIREMENTS: FTRACE CONFIG and block:block_rq_* tracepoints, which you may | |
| # already have on recent kernels. | |
| # | |
| # OVERHEAD: block device I/O issue and completion events are traced and buffered | |
| # in-kernel, then processed and summarized in user space. There may be | |
| # measurable overhead with this approach, relative to the block device IOPS. | |
| # | |
| # This was written as a proof of concept for ftrace. | |
| # | |
| # From perf-tools: https://github.com/brendangregg/perf-tools | |
| # | |
| # COPYRIGHT: Copyright (c) 2014 Brendan Gregg. | |
| # | |
| # This program is free software; you can redistribute it and/or | |
| # modify it under the terms of the GNU General Public License | |
| # as published by the Free Software Foundation; either version 2 | |
| # of the License, or (at your option) any later version. | |
| # | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # GNU General Public License for more details. | |
| # | |
| # You should have received a copy of the GNU General Public License | |
| # along with this program; if not, write to the Free Software Foundation, | |
| # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
| # | |
| # (http://www.gnu.org/copyleft/gpl.html) | |
| # | |
| # 20-Jul-2014 Brendan Gregg Created this. | |
| ### default variables | |
| tracing=/sys/kernel/debug/tracing | |
| flock=/var/tmp/.ftrace-lock | |
| bufsize_kb=4096 | |
| opt_device=0; device=; opt_iotype=0; iotype=; opt_timestamp=0 | |
| opt_interval=0; interval=1; opt_count=0; count=0; opt_queue=0 | |
| trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section | |
| function usage { | |
| cat <<-END >&2 | |
| USAGE: iolatency [-hQT] [-d device] [-i iotype] [interval [count]] | |
| -d device # device string (eg, "202,1) | |
| -i iotype # match type (eg, '*R*' for all reads) | |
| -Q # use queue insert as start time | |
| -T # timestamp on output | |
| -h # this usage message | |
| interval # summary interval, seconds (default 1) | |
| count # number of summaries | |
| eg, | |
| iolatency # summarize latency every second | |
| iolatency -Q # include block I/O queue time | |
| iolatency 5 2 # 2 x 5 second summaries | |
| iolatency -i '*R*' # trace reads | |
| iolatency -d 202,1 # trace device 202,1 only | |
| See the man page and example file for more info. | |
| END | |
| exit | |
| } | |
| function warn { | |
| if ! eval "$@"; then | |
| echo >&2 "WARNING: command failed \"$@\"" | |
| fi | |
| } | |
| function end { | |
| # disable tracing | |
| echo 2>/dev/null | |
| echo "Ending tracing..." 2>/dev/null | |
| cd $tracing | |
| warn "echo 0 > events/block/$b_start/enable" | |
| warn "echo 0 > events/block/block_rq_complete/enable" | |
| if (( opt_device || opt_iotype )); then | |
| warn "echo 0 > events/block/$b_start/filter" | |
| warn "echo 0 > events/block/block_rq_complete/filter" | |
| fi | |
| warn "echo > trace" | |
| (( wroteflock )) && warn "rm $flock" | |
| } | |
| function die { | |
| echo >&2 "$@" | |
| exit 1 | |
| } | |
| function edie { | |
| # die with a quiet end() | |
| echo >&2 "$@" | |
| exec >/dev/null 2>&1 | |
| end | |
| exit 1 | |
| } | |
| ### process options | |
| while getopts d:hi:QT opt | |
| do | |
| case $opt in | |
| d) opt_device=1; device=$OPTARG ;; | |
| i) opt_iotype=1; iotype=$OPTARG ;; | |
| Q) opt_queue=1 ;; | |
| T) opt_timestamp=1 ;; | |
| h|?) usage ;; | |
| esac | |
| done | |
| shift $(( $OPTIND - 1 )) | |
| if (( $# )); then | |
| opt_interval=1 | |
| interval=$1 | |
| shift | |
| fi | |
| if (( $# )); then | |
| opt_count=1 | |
| count=$1 | |
| fi | |
| if (( opt_device )); then | |
| major=${device%,*} | |
| minor=${device#*,} | |
| dev=$(( (major << 20) + minor )) | |
| fi | |
| if (( opt_queue )); then | |
| b_start=block_rq_insert | |
| else | |
| b_start=block_rq_issue | |
| fi | |
| ### select awk | |
| [[ -x /usr/bin/mawk ]] && awk='mawk -W interactive' || awk=awk | |
| ### check permissions | |
| cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? | |
| debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" | |
| ### ftrace lock | |
| [[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock" | |
| echo $$ > $flock || die "ERROR: unable to write $flock." | |
| wroteflock=1 | |
| ### setup and begin tracing | |
| warn "echo nop > current_tracer" | |
| warn "echo $bufsize_kb > buffer_size_kb" | |
| filter= | |
| if (( opt_iotype )); then | |
| filter="rwbs ~ \"$iotype\"" | |
| fi | |
| if (( opt_device )); then | |
| [[ "$filter" != "" ]] && filter="$filter && " | |
| filter="${filter}dev == $dev" | |
| fi | |
| if (( opt_iotype || opt_device )); then | |
| if ! echo "$filter" > events/block/$b_start/filter || \ | |
| ! echo "$filter" > events/block/block_rq_complete/filter | |
| then | |
| edie "ERROR: setting -d or -t filter. Exiting." | |
| fi | |
| fi | |
| if ! echo 1 > events/block/$b_start/enable || \ | |
| ! echo 1 > events/block/block_rq_complete/enable; then | |
| edie "ERROR: enabling block I/O tracepoints. Exiting." | |
| fi | |
| etext= | |
| (( !opt_count )) && etext=" Ctrl-C to end." | |
| echo "Tracing block I/O. Output every $interval seconds.$etext" | |
| # | |
| # Determine output format. It may be one of the following (newest first): | |
| # TASK-PID CPU# |||| TIMESTAMP FUNCTION | |
| # TASK-PID CPU# TIMESTAMP FUNCTION | |
| # To differentiate between them, the number of header fields is counted, | |
| # and an offset set, to skip the extra column when needed. | |
| # | |
| offset=$($awk 'BEGIN { o = 0; } | |
| $1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; } | |
| $2 ~ /TASK/ { print o; exit }' trace) | |
| ### print trace buffer | |
| warn "echo > trace" | |
| i=0 | |
| while (( !opt_count || (i < count) )); do | |
| (( i++ )) | |
| sleep $interval | |
| # snapshots were added in 3.10 | |
| if [[ -x snapshot ]]; then | |
| echo 1 > snapshot | |
| echo > trace | |
| cat snapshot | |
| else | |
| cat trace | |
| echo > trace | |
| fi | |
| (( opt_timestamp )) && printf "time %(%H:%M:%S)T:\n" -1 | |
| echo "tick" | |
| done | \ | |
| $awk -v o=$offset -v opt_timestamp=$opt_timestamp -v b_start=$b_start ' | |
| function star(sval, smax, swidth) { | |
| stars = "" | |
| if (smax == 0) return "" | |
| for (si = 0; si < (swidth * sval / smax); si++) { | |
| stars = stars "#" | |
| } | |
| return stars | |
| } | |
| BEGIN { max_i = 0 } | |
| # common fields | |
| $1 != "#" { | |
| time = $(3+o); sub(":", "", time) | |
| dev = $(5+o) | |
| } | |
| # block I/O request | |
| $1 != "#" && $0 ~ b_start { | |
| # | |
| # example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar] | |
| # The cmd field "()" might contain multiple words (hex), | |
| # hence stepping from the right (NF-3). | |
| # | |
| loc = $(NF-3) | |
| starts[dev, loc] = time | |
| next | |
| } | |
| # block I/O completion | |
| $1 != "#" && $0 ~ /rq_complete/ { | |
| # | |
| # example: (fields1..4+o) 202,1 W () 12862256 + 8 [0] | |
| # | |
| dir = $(6+o) | |
| loc = $(NF-3) | |
| if (starts[dev, loc] > 0) { | |
| latency_ms = 1000 * (time - starts[dev, loc]) | |
| i = 0 | |
| for (ms = 1; latency_ms > ms; ms *= 2) { i++ } | |
| hist[i]++ | |
| if (i > max_i) | |
| max_i = i | |
| delete starts[dev, loc] | |
| } | |
| next | |
| } | |
| # timestamp | |
| $1 == "time" { | |
| lasttime = $2 | |
| } | |
| # print summary | |
| $1 == "tick" { | |
| print "" | |
| if (opt_timestamp) | |
| print lasttime | |
| # find max value | |
| max_v = 0 | |
| for (i = 0; i <= max_i; i++) { | |
| if (hist[i] > max_v) | |
| max_v = hist[i] | |
| } | |
| # print histogram | |
| printf "%8s .. %-8s: %-8s |%-38s|\n", ">=(ms)", "<(ms)", | |
| "I/O", "Distribution" | |
| ms = 1 | |
| from = 0 | |
| for (i = 0; i <= max_i; i++) { | |
| printf "%8d -> %-8d: %-8d |%-38s|\n", from, ms, | |
| hist[i], star(hist[i], max_v, 38) | |
| from = ms | |
| ms *= 2 | |
| } | |
| fflush() | |
| delete hist | |
| delete starts # invalid if events missed between snapshots | |
| max_i = 0 | |
| } | |
| $0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" } | |
| ' | |
| ### end tracing | |
| end |