Permalink
#!/bin/bash | |
# | |
# iosnoop - trace block device I/O. | |
# Written using Linux ftrace. | |
# | |
# This traces disk I/O at the block device interface, using the block: | |
# tracepoints. This can help characterize the I/O requested for the storage | |
# devices and their resulting performance. I/O completions can also be studied | |
# event-by-event for debugging disk and controller I/O scheduling issues. | |
# | |
# USAGE: ./iosnoop [-hQst] [-d device] [-i iotype] [-p pid] [-n name] [duration] | |
# | |
# Run "iosnoop -h" for full usage. | |
# | |
# REQUIREMENTS: FTRACE CONFIG, block:block_rq_* tracepoints (you may | |
# already have these on recent kernels). | |
# | |
# OVERHEAD: By default, iosnoop works without buffering, printing I/O events | |
# as they happen (uses trace_pipe), context switching and consuming CPU to do | |
# so. This has a limit of about 10,000 IOPS (depending on your platform), at | |
# which point iosnoop will be consuming 1 CPU. The duration mode uses buffering, | |
# and can handle much higher IOPS rates, however, the buffer has a limit of | |
# about 50,000 I/O, after which events will be dropped. You can tune this with | |
# bufsize_kb, which is per-CPU. Also note that the "-n" option is currently | |
# post-filtered, so all events are traced. | |
# | |
# This was written as a proof of concept for ftrace. It would be better written | |
# using perf_events (after some capabilities are added), which has a better | |
# buffering policy, or a tracer such as SystemTap or ktap. | |
# | |
# From perf-tools: https://github.com/brendangregg/perf-tools | |
# | |
# See the iosnoop(8) man page (in perf-tools) for more info. | |
# | |
# COPYRIGHT: Copyright (c) 2014 Brendan Gregg. | |
# | |
# This program is free software; you can redistribute it and/or | |
# modify it under the terms of the GNU General Public License | |
# as published by the Free Software Foundation; either version 2 | |
# of the License, or (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program; if not, write to the Free Software Foundation, | |
# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
# | |
# (http://www.gnu.org/copyleft/gpl.html) | |
# | |
# 12-Jul-2014 Brendan Gregg Created this. | |
### default variables | |
tracing=/sys/kernel/debug/tracing | |
flock=/var/tmp/.ftrace-lock | |
bufsize_kb=4096 | |
opt_duration=0; duration=; opt_name=0; name=; opt_pid=0; pid=; ftext= | |
opt_start=0; opt_end=0; opt_device=0; device=; opt_iotype=0; iotype= | |
opt_queue=0 | |
trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section | |
function usage { | |
cat <<-END >&2 | |
USAGE: iosnoop [-hQst] [-d device] [-i iotype] [-p PID] [-n name] | |
[duration] | |
-d device # device string (eg, "202,1) | |
-i iotype # match type (eg, '*R*' for all reads) | |
-n name # process name to match on I/O issue | |
-p PID # PID to match on I/O issue | |
-Q # use queue insert as start time | |
-s # include start time of I/O (s) | |
-t # include completion time of I/O (s) | |
-h # this usage message | |
duration # duration seconds, and use buffers | |
eg, | |
iosnoop # watch block I/O live (unbuffered) | |
iosnoop 1 # trace 1 sec (buffered) | |
iosnoop -Q # include queueing time in LATms | |
iosnoop -ts # include start and end timestamps | |
iosnoop -i '*R*' # trace reads | |
iosnoop -p 91 # show I/O issued when PID 91 is on-CPU | |
iosnoop -Qp 91 # show I/O queued by PID 91, queue time | |
See the man page and example file for more info. | |
END | |
exit | |
} | |
function warn { | |
if ! eval "$@"; then | |
echo >&2 "WARNING: command failed \"$@\"" | |
fi | |
} | |
function end { | |
# disable tracing | |
echo 2>/dev/null | |
echo "Ending tracing..." 2>/dev/null | |
cd $tracing | |
warn "echo 0 > events/block/$b_start/enable" | |
warn "echo 0 > events/block/block_rq_complete/enable" | |
if (( opt_device || opt_iotype || opt_pid )); then | |
warn "echo 0 > events/block/$b_start/filter" | |
warn "echo 0 > events/block/block_rq_complete/filter" | |
fi | |
warn "echo > trace" | |
(( wroteflock )) && warn "rm $flock" | |
} | |
function die { | |
echo >&2 "$@" | |
exit 1 | |
} | |
function edie { | |
# die with a quiet end() | |
echo >&2 "$@" | |
exec >/dev/null 2>&1 | |
end | |
exit 1 | |
} | |
### process options | |
while getopts d:hi:n:p:Qst opt | |
do | |
case $opt in | |
d) opt_device=1; device=$OPTARG ;; | |
i) opt_iotype=1; iotype=$OPTARG ;; | |
n) opt_name=1; name=$OPTARG ;; | |
p) opt_pid=1; pid=$OPTARG ;; | |
Q) opt_queue=1 ;; | |
s) opt_start=1 ;; | |
t) opt_end=1 ;; | |
h|?) usage ;; | |
esac | |
done | |
shift $(( $OPTIND - 1 )) | |
if (( $# )); then | |
opt_duration=1 | |
duration=$1 | |
shift | |
fi | |
if (( opt_device )); then | |
major=${device%,*} | |
minor=${device#*,} | |
dev=$(( (major << 20) + minor )) | |
fi | |
### option logic | |
(( opt_pid && opt_name )) && die "ERROR: use either -p or -n." | |
(( opt_pid )) && ftext=" issued by PID $pid" | |
(( opt_name )) && ftext=" issued by process name \"$name\"" | |
if (( opt_duration )); then | |
echo "Tracing block I/O$ftext for $duration seconds (buffered)..." | |
else | |
echo "Tracing block I/O$ftext. Ctrl-C to end." | |
fi | |
if (( opt_queue )); then | |
b_start=block_rq_insert | |
else | |
b_start=block_rq_issue | |
fi | |
### select awk | |
(( opt_duration )) && use=mawk || use=gawk # workaround for mawk fflush() | |
[[ -x /usr/bin/$use ]] && awk=$use || awk=awk | |
wroteflock=1 | |
### check permissions | |
cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? | |
debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" | |
### ftrace lock | |
[[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock" | |
echo $$ > $flock || die "ERROR: unable to write $flock." | |
### setup and begin tracing | |
echo nop > current_tracer | |
warn "echo $bufsize_kb > buffer_size_kb" | |
filter= | |
if (( opt_iotype )); then | |
filter="rwbs ~ \"$iotype\"" | |
fi | |
if (( opt_device )); then | |
[[ "$filter" != "" ]] && filter="$filter && " | |
filter="${filter}dev == $dev" | |
fi | |
filter_i=$filter | |
if (( opt_pid )); then | |
[[ "$filter_i" != "" ]] && filter_i="$filter_i && " | |
filter_i="${filter_i}common_pid == $pid" | |
[[ "$filter" == "" ]] && filter=0 | |
fi | |
if (( opt_iotype || opt_device || opt_pid )); then | |
if ! echo "$filter_i" > events/block/$b_start/filter || \ | |
! echo "$filter" > events/block/block_rq_complete/filter | |
then | |
edie "ERROR: setting -d or -t filter. Exiting." | |
fi | |
fi | |
if ! echo 1 > events/block/$b_start/enable || \ | |
! echo 1 > events/block/block_rq_complete/enable; then | |
edie "ERROR: enabling block I/O tracepoints. Exiting." | |
fi | |
(( opt_start )) && printf "%-15s " "STARTs" | |
(( opt_end )) && printf "%-15s " "ENDs" | |
printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n" \ | |
"COMM" "PID" "TYPE" "DEV" "BLOCK" "BYTES" "LATms" | |
# | |
# Determine output format. It may be one of the following (newest first): | |
# TASK-PID CPU# |||| TIMESTAMP FUNCTION | |
# TASK-PID CPU# TIMESTAMP FUNCTION | |
# To differentiate between them, the number of header fields is counted, | |
# and an offset set, to skip the extra column when needed. | |
# | |
offset=$($awk 'BEGIN { o = 0; } | |
$1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; } | |
$2 ~ /TASK/ { print o; exit }' trace) | |
### print trace buffer | |
warn "echo > trace" | |
( if (( opt_duration )); then | |
# wait then dump buffer | |
sleep $duration | |
cat trace | |
else | |
# print buffer live | |
cat trace_pipe | |
fi ) | $awk -v o=$offset -v opt_name=$opt_name -v name=$name \ | |
-v opt_duration=$opt_duration -v opt_start=$opt_start -v opt_end=$opt_end \ | |
-v b_start=$b_start ' | |
# common fields | |
$1 != "#" { | |
# task name can contain dashes | |
comm = pid = $1 | |
sub(/-[0-9][0-9]*/, "", comm) | |
sub(/.*-/, "", pid) | |
time = $(3+o); sub(":", "", time) | |
dev = $(5+o) | |
} | |
# block I/O request | |
$1 != "#" && $0 ~ b_start { | |
if (opt_name && match(comm, name) == 0) | |
next | |
# | |
# example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar] | |
# The cmd field "()" might contain multiple words (hex), | |
# hence stepping from the right (NF-3). | |
# | |
loc = $(NF-3) | |
starts[dev, loc] = time | |
comms[dev, loc] = comm | |
pids[dev, loc] = pid | |
next | |
} | |
# block I/O completion | |
$1 != "#" && $0 ~ /rq_complete/ { | |
# | |
# example: (fields1..4+o) 202,1 W () 12862256 + 8 [0] | |
# | |
dir = $(6+o) | |
loc = $(NF-3) | |
nsec = $(NF-1) | |
if (starts[dev, loc] > 0) { | |
latency = sprintf("%.2f", | |
1000 * (time - starts[dev, loc])) | |
comm = comms[dev, loc] | |
pid = pids[dev, loc] | |
if (opt_start) | |
printf "%-15s ", starts[dev, loc] | |
if (opt_end) | |
printf "%-15s ", time | |
printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n", | |
comm, pid, dir, dev, loc, nsec * 512, latency | |
if (!opt_duration) | |
fflush() | |
delete starts[dev, loc] | |
delete comms[dev, loc] | |
delete pids[dev, loc] | |
} | |
next | |
} | |
$0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" } | |
' | |
### end tracing | |
end |