Permalink
Find file
7018394 Feb 16, 2017
@aswen @akomakom @ssams @markruys @kisst @erude1 @edyan
executable file 333 lines (301 sloc) 14.6 KB
#!/bin/sh
# Nagios plugin to monitor Puppet agent state
#
# Copyright (c) 2011 Alexander Swen <a@swen.nu>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# Example configuration
#
# Typically this check is placed on a client and run via NRPE.
# So add this to nrpe.cfg:
# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet
# or if you want to specify options (rather than have the script calculate key values and facts) then something like
# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w 3600 -c 7200 -s /var/lib/puppet/state/last_run_summary.yaml -d 0
# This should warn when the agent hasn't run for an hour and go critical after two hours.
# If you have dont_blame_nrpe=1 set, you can instead let the Nagios server pass the arguments:
# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w $ARG1$ -c $ARG2$ -s $ARG3$ -d $ARG4$
#
# define service {
# use generic-service
# service_description Puppet agent
# check_command check_nrpe!check_puppet_agent
# or
# check_command check_nrpe!check_puppet_agent!3600!7200
#}
#
# Sudo required.
# The user running this script must be allowed to run "puppet config print" via sudo, e.g. include the following lines in /etc/sudoers:
# User_Alias NAGIOS=nagios
# Cmnd_Alias PUPPETCHECK=/usr/bin/puppet config print --section agent runinterval,\
# /usr/bin/puppet config print --section agent splay,\
# /usr/bin/puppet config print --section agent splaylimit,\
# /usr/bin/puppet config print --section agent agent_disabled_lockfile,\
# /usr/bin/puppet config print --section agent lastrunfile,\
# /usr/bin/puppet config print --section agent lastrunreport,\
# /usr/bin/puppet config print --section agent pidfile
# NAGIOS ALL=NOPASSWD:PUPPETCHECK
#
# CHANGELOG:
# 20120126 A.Swen created.
# 20120214 trey85stang Modified, added getopts, usage, defaults.
# 20120220 A.Swen lastrunfile can be overridden.
# 20130717 A.Swen Moved finding lastrunfile to after getopts and made it conditional to param -s.
# Added option to tell script if puppet agent is started from cron or as a daemon (-d).
# Switched to use awk to filter values from lastrunfile and set them as params.
# Updated some comments.
# Removed bug in search for process (that would previously always find something because grep finds its own process line).
# "puppet agent --configprint lastrunfile" has to be run as root. As normal user it yields ~/.puppet/var/state.
# Based on feedback Михайло Масик updated:
# - Puppet --configprint => puppet agent --configprint (version 3 has new way of printing config).
# - Added new pattern to search for process.
# - Added test kill -0 to see if process is still there.
# 20130725 A.Swen Based on feedback Михайло Масик updated a test (removed ! from test).
# 20130725 A.Swen Added sudo to puppet config print pidfile.
# 20131209 Mark Ruys Issue warning when last_run_report.yaml contain errors.
# 20141015 A.Swen Add show disabled status.
# 20141127 KissT Remove requirement to have sudo custom rule.
# 20150917 A.Swen Based on an idea of Daniel Lawrence check for major version to decide how to print config.
# Based on idea of D.Stirling switched to sh.
# Find out puppet executable location using which.
# Based on an idea of D.Stirling updated daemon check.
# Based on an idea of D.Stirling made BSD compatible.
# Based on an idea of BTriller fix the getopts command to parse the agent_disabled_lockfile option.
# 20151201 Akomakom Add perf data option.
# More reliable yaml parsing.
# If $HOME not set: set it.
# Fix PS command for Suse.
# 20151218 K.A. Gillow Calculate warn/crit based on runinterval and splay setting rather than use fixed settings.
# Check system has been up longer than crit/warn time otherwise don't yet trigger normally relevant fault levels.
# We never generally want puppet disabled so change to warning.
# 20151229 A.Swen Fix bug in PERF_DATA (replace compset by set).
# Prettify $PERF_DATA output.
# 20160201 S. Sams Changes to PERF_DATA output format to increase compatibility with Nagios Plugin guidelines.
# Add compatibility with Puppet 4.x
# 20160315 J. Yaworski Add -v, allowing to pass a version to compare
# 20160815 L. Buriola Add -E to show first error on output
# FUNCTIONS
# Print the Nagios status line that belongs to an internal result code and
# terminate the script with the matching Nagios return code.
# Arguments:
#   $1 - internal result code (see the case arms below)
#   $2 - free-text detail, only used by code 9 (internal error)
# Globals read: version, config, last_run_human, time_since_last, WARN, CRIT,
#   PERF_DATA, FIRST_ERROR, agent_disabled_lockfile, runinterval, splay,
#   splaylimit, wanted_version
# Exit status: 0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN).
result () {
  case $1 in
    0) echo "OK: Puppet agent $version running catalogversion $config, and executed at $last_run_human for last time. $PERF_DATA";rc=0 ;;
    1) echo "UNKNOWN: last_run_summary.yaml not found, not readable or incomplete";rc=3 ;;
    2) echo "WARNING: Last run was $time_since_last seconds ago. Warn is $WARN. $PERF_DATA";rc=1 ;;
    3) echo "CRITICAL: Last run was $time_since_last seconds ago. Crit is $CRIT. $PERF_DATA";rc=2 ;;
    4) echo "CRITICAL: Puppet daemon not running or something wrong with process";rc=2 ;;
    5) echo "UNKNOWN: no WARN or CRIT parameters were sent to this check";rc=3 ;;
    6) echo "CRITICAL: Last run had 1 or more errors. Check the logs. $FIRST_ERROR $PERF_DATA";rc=2 ;;
    # The lock file holds {"disabled_message":"..."}; strip the wrapper to show only the reason.
    7) echo "DISABLED: Reason: $(sed -e 's/{"disabled_message":"//' -e 's/"}//' "$agent_disabled_lockfile"). $PERF_DATA";rc=1 ;;
    8) echo "UNKNOWN: No Puppet executable found";rc=3 ;;
    9) echo "UNKNOWN: Internal error: $2"; rc=3 ;;
    10) echo "OK (PROBABLY): Puppet agent last successful run $last_run_human (runinterval $runinterval, splay $splay, splaylimit $splaylimit) but system has not been up long enough to guarantee a fresh puppet run should have occurred";rc=0 ;;
    11) echo "INFO: Puppet agent is version $version, but should be $wanted_version. $PERF_DATA";rc=0 ;;
    12) echo "UNKNOWN: last_run_report.yaml not found, not readable or incomplete";rc=3 ;;
    # Never leave rc unset: an unhandled code must not be reported as OK.
    *) echo "UNKNOWN: Internal error: unhandled result code '$1'";rc=3 ;;
  esac
  exit $rc
}
# Print the command line help on stdout and terminate with exit status 1.
# Reads $0 to show the name the script was invoked with.
usage () {
  printf '%s\n' \
    "" \
    "USAGE: " \
    " $0 [-c 7200] [-w 3600] [-d 0] [-l agent_disabled_lockfile] [-s lastrunfile] [-r lastrunreport] [-v wanted_version] [-PEh]" \
    " -c Critical threshold (default 7200 seconds)" \
    " -w Warning threshold (default 3600 seconds)" \
    " -d 0|1: puppet agent should be a daemon(1) or not (0).(default 1)" \
    " -h Show this help." \
    " -l Agent_disabled_lockfile (default: /var/lib/puppet/state/agent_disabled.lock)" \
    " -s Lastrunfile (default: /var/lib/puppet/state/last_run_summary.yaml)" \
    " -r Lastrunreport (default: /var/lib/puppet/state/last_run_report.yaml)" \
    " -P Enable perf_data in the output" \
    " -E Show first error in the output" \
    " -v The version of puppet that should be running" \
    ""
  exit 1
}
# Get a flat representation of yaml without relying on external tools.
# Every "key: value" line of the file is printed as one shell assignment
#   <prefix><parent>_<child>_..._<key>="<value>"
# where the parent names are taken from less-indented lines seen earlier.
# Arguments:
#   $1 - path of the yaml file to read
#   $2 - optional prefix put in front of every generated variable name
# Output: assignments on stdout, one per line; lines without a value only
#   update the current path and print nothing.
# Limitations visible in the code: nesting depth is computed as
#   (leading whitespace length)/2, so two-space indentation is assumed, and
#   values are emitted unescaped between double quotes - callers that eval
#   the output must trust the file.
parse_yaml () {
  local prefix="$2"
  local s='[[:space:]]*'
  local w='[a-zA-Z0-9_]*'
  local fs
  # Field separator (ASCII 034) that is not expected to occur in yaml text.
  fs=$(echo @ | tr @ '\034')
  # 1st expression: drop a ':' that directly precedes the key.
  # 2nd expression: key: "quoted value"  -> indent FS key FS value
  # 3rd expression: key: plain value     -> indent FS key FS value
  sed -ne "s|^\($s\):|\1|" \
      -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
      -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" "$1" |
  awk -F"$fs" -v prefix="$prefix" '{
    indent = length($1)/2;
    vname[indent] = $2;
    # Forget path components that are deeper than the current line.
    for (i in vname) {if (i > indent) {delete vname[i]}}
    if (length($3) > 0) {
      vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
      printf("%s%s%s=\"%s\"\n", prefix, vn, $2, $3);
    }
  }'
}
# Get first error from last_run_report.yaml
# Looks at the lines around every "status: failure" entry of $lastrunreport,
# picks the smallest "time: " line found there and prints the "message: "
# text found in the context of that time line.
# Globals read: lastrunreport (path of the report file)
# Output: "FIRST_ERROR (<message>)" on stdout; <message> is empty when the
#   report holds no failure entry or cannot be read.
get_first_error() {
  local first_error_time first_error=""
  # grep is looked up via PATH: it is not located in /bin on every platform.
  first_error_time=$(grep -B 3 -A 1 "status: failure" "$lastrunreport" | grep "time: " | sort -n | head -1)
  # Without a failure timestamp there is nothing to look up; an empty
  # pattern would match every line and report an unrelated message.
  if [ -n "$first_error_time" ]; then
    # -F: the timestamp line is literal text, not a regular expression.
    first_error=$(grep -F -B 3 -A 1 -- "$first_error_time" "$lastrunreport" | grep "message: " | sed 's/.*message: //' | head -1)
  fi
  echo "FIRST_ERROR ($first_error)"
}
# SCRIPT
# Parse the command line. Invalid or unknown options end in usage(), which
# prints the help text and exits.
while getopts "c:d:l:s:r:w:v:PEh" opt; do
  case "$opt" in
    c)
      # Critical threshold: must consist of digits only, because it is used
      # in integer comparisons later on.
      case "$OPTARG" in
        ''|*[!0-9]*) usage ;;
        *) CRIT=$OPTARG ;;
      esac
      ;;
    d)
      # Daemon mode flag: argument should be 0 or 1.
      case "$OPTARG" in
        0|1) daemonized=$OPTARG ;;
        *) usage ;;
      esac
      ;;
    h) usage ;;
    l) agent_disabled_lockfile=$OPTARG ;;
    s) lastrunfile=$OPTARG ;;
    r) lastrunreport=$OPTARG ;;
    w)
      # Warning threshold: same validation as the critical threshold.
      case "$OPTARG" in
        ''|*[!0-9]*) usage ;;
        *) WARN=$OPTARG ;;
      esac
      ;;
    P)
      # Append performance data to the status line.
      PERF=true
      ;;
    E)
      # Include the first error of the last run report in the output.
      SHOW_ERROR=true
      ;;
    v)
      # Puppet version the agent is expected to run.
      wanted_version=$OPTARG
      ;;
    *)
      usage
      ;;
  esac
done
# Some clean environment situations make puppet -V fail, so make sure HOME is set.
if [ -z "$HOME" ]; then
  HOME=$(getent passwd "$(whoami)" | cut -d: -f6)
  export HOME
fi
# Ensure installation directory of puppet 4 is included in PATH
PATH="$PATH:/opt/puppetlabs/bin"
# Find location of puppet executable ("command -v" is the POSIX way; "which" is not always installed).
PUPPET=$(command -v puppet) || result 8
# Find out Puppet major version to determine configprint syntax.
puppet_major_version=$("$PUPPET" -V | cut -d. -f1)
# The value is used in integer comparisons, so anything but digits is an error.
case "$puppet_major_version" in
  ''|*[!0-9]*) result 9 "Puppet version unknown from $("$PUPPET" -V 2>&1)" ;;
esac
# Set Puppet configprint syntax.
# The command is kept in a plain string and expanded unquoted by its users.
if [ "$puppet_major_version" -ge 3 ]; then
  puppet_config_print="sudo $PUPPET config print --section agent"
else
  puppet_config_print="sudo $PUPPET --configprint"
fi
# Construct WARN and CRIT times based on runinterval plus a safety buffer
# if they have not already been explicitly set.
runinterval=$($puppet_config_print runinterval)
splaylimit=0
splay=$($puppet_config_print splay)
[ "$splay" != "false" ] && splaylimit=$($puppet_config_print splaylimit)
# The puppet settings may be empty or non-numeric when they could not be read
# (e.g. sudo refused); fall back to 0 so the arithmetic below cannot fail and
# the sanity check reports UNKNOWN instead.
case "$runinterval" in
  ''|*[!0-9]*) runinterval=0 ;;
esac
case "$splaylimit" in
  ''|*[!0-9]*) splaylimit=0 ;;
esac
[ -z "$WARN" ] && WARN=$((runinterval + splaylimit))
[ -z "$CRIT" ] && CRIT=$((WARN + runinterval))
# Now check we finally have some sensible settings.
[ "$WARN" -lt 30 ] && result 5
[ "$CRIT" -lt 60 ] && result 5
# If the disabled lockfile is not given as a param try to find it ourselves.
[ -z "$agent_disabled_lockfile" ] && agent_disabled_lockfile=$($puppet_config_print agent_disabled_lockfile)
# If there's a disabled.lock file don't look any further.
[ -f "$agent_disabled_lockfile" ] && result 7
# If the lastrunfile is not given as a param try to find it ourselves.
[ -z "$lastrunfile" ] && lastrunfile=$($puppet_config_print lastrunfile)
# Check if the summary file exists, is not empty and is readable.
# The path is quoted: unquoted, an empty value would turn the test into
# "[ -s -a -r ]", which is true.
if [ ! -s "$lastrunfile" ] || [ ! -r "$lastrunfile" ]; then
  result 1
fi
# If the lastrunreport is not given as a param try to find it ourselves.
[ -z "$lastrunreport" ] && lastrunreport=$($puppet_config_print lastrunreport)
# The report is only needed when the first error has to be shown (-E).
# No subshell here: result() must be able to terminate the script itself.
if [ -n "$SHOW_ERROR" ]; then
  if [ ! -s "$lastrunreport" ] || [ ! -r "$lastrunreport" ]; then
    result 12
  fi
fi
# Check if daemonized was set, else set default to 1.
[ -n "$daemonized" ] || daemonized=1
# If Puppet agent runs as a daemon there should be a process. We can't check so much when it is triggered by cron.
if [ "$daemonized" -eq 1 ]; then
  # Puppet version 4 changed several paths, determine correct ones
  if [ "$puppet_major_version" -ge 4 ]; then
    puppet_daemon_rundir="puppetlabs"
    puppet_daemon_regex="/opt/puppetlabs/puppet/bin/ruby /opt/puppetlabs/puppet/bin/puppet"
  else
    puppet_daemon_rundir="puppet"
    puppet_daemon_regex="/usr(/local)?/bin/ruby[^ ]* /usr(/local)?/s?bin/puppetd?"
  fi
  # Check puppet daemon: some process line has to match the regex. The
  # second grep drops the line of the searching grep itself.
  [ -n "$(ps axfww | grep -E "$puppet_daemon_regex" | grep -v 'grep -E')" ] || result 4
  # The default pidfile lives in a different directory on BSD.
  if uname -a | grep -q BSD; then
    default_pidfile=/var/$puppet_daemon_rundir/run/agent.pid
  else
    default_pidfile=/var/run/$puppet_daemon_rundir/agent.pid
  fi
  # Prefer the default location, else ask puppet where the pidfile is.
  if [ -e "$default_pidfile" ]; then
    pidfile=$default_pidfile
  else
    pidfile=$($puppet_config_print pidfile)
  fi
  # If there is a pidfile tell me the pid, else fail.
  # Quoted on purpose: "[ -f ]" with an empty unquoted value is true.
  [ -f "$pidfile" ] || result 4
  pid=$(cat "$pidfile") || result 4
  # See if the process is running.
  ps -p "$pid" > /dev/null || result 4
  # On Linux test if the pid we found in the pidfile is puppet:
  if uname -a | grep -q Linux; then
    grep -q puppet "/proc/$pid/cmdline" || result 4
  fi
fi
# Parse last run file.
# Puppet version 4 files have less indentation, add prefix to match parsed variables from older versions.
yaml_prefix=""
[ "$puppet_major_version" -ge 4 ] && yaml_prefix="_"
# NOTE(review): eval executes whatever parse_yaml prints, and the yaml values
# are not escaped, so the summary file has to be trusted.
eval "$(parse_yaml "$lastrunfile" "$yaml_prefix")"
# This flattens the hierarchy to single-level name/value variables, eg:
# _events_total="14"
# _version_config="1448907293"
# Construct perf data using anything that starts with "_resources_ or _time_total"
if [ -n "$PERF" ]; then
  # Two -e patterns instead of "\|", which is not portable in basic regular expressions.
  for V in $(set | grep -e '^_resources_' -e '^_time_total'); do
    # Drop the leading underscore and the single quotes some shells put
    # around the value in the output of "set".
    PERF_DATA="$(printf '%s\n' "$V" | sed -e 's/^_//' -e "s/='/=/" -e "s/'\$//") $PERF_DATA"
  done
  PERF_DATA="| $PERF_DATA"
fi
# Construct FIRST_ERROR using last_run_report.yaml
if [ -n "$SHOW_ERROR" ]; then
  FIRST_ERROR=$(get_first_error)
fi
# Check when last run happened.
last_run=$_time_last_run
# Without a numeric timestamp the summary is incomplete. Report that now:
# an empty value would count as 0 in the arithmetic below and end in a
# misleading CRITICAL.
case "$last_run" in
  ''|*[!0-9]*) result 1 ;;
esac
# "date -d @N" is the GNU form, "date -r N" the BSD one; show the raw
# number when neither works.
last_run_human=$(date -d "@$last_run" +%c 2>/dev/null) ||
  last_run_human=$(date -r "$last_run" +%c 2>/dev/null) ||
  last_run_human=$last_run
now=$(date +%s)
time_since_last=$((now - last_run))
# Check how long system been up in seconds
uptime=""
[ -r /proc/uptime ] && uptime=$(cut -f1 -d' ' /proc/uptime | cut -f1 -d.)
# When the uptime is unknown (no /proc/uptime) behave as if the system has
# been up long enough, so a stale agent is still reported.
case "$uptime" in
  ''|*[!0-9]*) uptime=$time_since_last ;;
esac
# Assess last run time relative to warn/crit values and system uptime.
# A stale run only counts when the system has been up for at least the
# threshold; otherwise report "OK (PROBABLY)".
if [ "$time_since_last" -ge "$CRIT" ]; then
  [ "$uptime" -ge "$CRIT" ] && result 3
  result 10
fi
if [ "$time_since_last" -ge "$WARN" ]; then
  [ "$uptime" -ge "$WARN" ] && result 2
  result 10
fi
# Copy the remaining values of interest out of the parsed yaml variables.
config=$_version_config
version=$_version_puppet
failed=$_resources_failed
failure=$_events_failure
failed_to_restart=$_resources_failed_to_restart
# Every one of these values is required; a missing one means the summary is incomplete.
for required_value in "$last_run" "$config" "$version" "$failed" "$failure" "$failed_to_restart"; do
  [ -n "$required_value" ] || result 1
done
# Any failed resource, failed event or failed restart in the last run => crit.
if [ "$failed" -gt 0 -o "$failure" -gt 0 -o "$failed_to_restart" -gt 0 ]; then
  result 6
fi
# When a wanted version was given, report a running version that differs from it.
if [ -n "$wanted_version" ] && [ -n "$version" ] && [ "$wanted_version" != "$version" ]; then
  result 11
fi
# Nothing above fired, so all is ok.
result 0
# END