/
chk_oerr.sh
285 lines (285 loc) · 12.8 KB
/
chk_oerr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#!/bin/ksh -p
#============================================================================
# File: chk_oerr.sh
# Type: UNIX korn-shell script
# Author: Austin Hackett
# Date: 23Feb12
#
# Description:
#
# Check each ADR Home for alerts, incidents, and problems. We maintain
# a record of the current time when the script runs in order to
# implement a moving window that ensures we'll never miss an issue.
#
# Much of this script is adapted from Tim Gorman's UNIX shell
# scripts library at http://www.evdbt.com/tools.htm
#
# The idea of using adrci for this task was from a blog post by
# Coskan Gundogar:
#
# http://coskan.wordpress.com/?s=adrci
#
# Exit statuses:
# 0 normal successful completion
# 1 An error occurred
# 2 One or more alerts, incidents, or problems were found
#
# Modifications:
#============================================================================
#----------------------------------------------------------------------------
# Script-wide state: the program name (used in file names and in the mail
# subject), the exit status, and the flag raised when something is found...
#----------------------------------------------------------------------------
Pgm=chk_oerr
ReturnCode=0
SendEmail=0
#
#----------------------------------------------------------------------------
# Run with a fixed, minimal PATH instead of the caller's...
#----------------------------------------------------------------------------
export PATH=/usr/bin:/usr/local/bin
#
#----------------------------------------------------------------------------
# Korn-shell function to send email notifications...
#----------------------------------------------------------------------------
notify_via_email() # ...mail the text held in $ErrMsg to the DBA mailbox...
{
    #
    # Globals read: Pgm    - program name, first word of the subject
    #               ErrMsg - message body
    #
    # The subject is "<program name> <node name>". The body is handed to
    # mailx on standard input through a here-document; the function's exit
    # status is that of mailx.
    #
    mailx -s "$Pgm $(uname -n)" dba@mycompany.com << __EOF__
$ErrMsg
__EOF__
} # ...end of shell function "notify_via_email"...
#
#----------------------------------------------------------------------------
# Korn-shell function to check one ADR home for alerts, incidents, and problems...
#----------------------------------------------------------------------------
check_adr_home() # ...check ADR home for alerts, problems, and incidents
{
    #
    #--------------------------------------------------------------------
    # Scan the ADR home named by $AdrHome for alert log entries,
    # incidents, and problems dated from $LastTime (inclusive) up to
    # $CurrTime (exclusive). Anything found is appended to $ReportFile
    # and SendEmail is set to 1.
    #
    # Globals read:    AdrHome, HomeType, LastTime, CurrTime, TempFile,
    #                  ReportFile
    # Globals written: Line, TimeZone, LineCount, SendEmail, ErrMsg
    #
    # In the ADR, alert creation times etc. are stored as systimestamps.
    # This means that we need to know the timezone used for a given ADR
    # home. Since the last entry of the alert log will contain this info
    # we can use it to extract the timezone. However, if the ADR was
    # purged recently, then a tail of the alert log will raise a
    # DIA-48156 "Alert log purge has occurred" in which case we'll skip
    # this home because there is no alert log to check for issues...
    #--------------------------------------------------------------------
    Line=$(adrci exec="set home $AdrHome; show alert -tail 1" | head -1)
    case $Line in
    *DIA-48156*) return 0 ;;
    esac
    #
    # The UTC offset is the third whitespace-separated field of the line.
    # $Line is quoted so that the shell never glob-expands its contents.
    #
    TimeZone=$(printf '%s\n' "$Line" | awk '{print $3}')
    #
    # Accept offsets on either side of UTC (+HH:MM or -HH:MM). Anything
    # else means the line could not be parsed, so report it and give up
    # on this home...
    #
    case $TimeZone in
    [+-][0-9][0-9]:[0-9][0-9])
        ;;
    *)
        ErrMsg="Unable to determine timezone for ADR HOME \"$AdrHome\" (Line=\"$Line\")"
        notify_via_email
        return
        ;;
    esac
    #
    #--------------------------------------------------------------------
    # Build the quoted timestamp literals that bound the time window, and
    # pick the message filter: TNS- errors for listener homes, ORA-
    # errors and corruption messages for everything else...
    #--------------------------------------------------------------------
    typeset Since="'$LastTime.000000 $TimeZone'"
    typeset Until="'$CurrTime.000000 $TimeZone'"
    typeset MsgFilter
    if [[ $HomeType = "tnslsnr" ]]
    then
        MsgFilter="message_text like '%TNS-%'"
    else
        MsgFilter="(message_text like '%ORA-%' or message_text like '%CORRUPT%')"
    fi
    #
    #--------------------------------------------------------------------
    # Obtain a list of alerts, incidents, and problems that have occurred
    # since the script last ran. Each query overwrites $TempFile, which
    # is then examined before the next query runs...
    #--------------------------------------------------------------------
    adrci exec="set home $AdrHome; show alert -p \\\"$MsgFilter and originating_timestamp >= $Since and originating_timestamp < $Until\\\"" -term > "$TempFile"
    _adr_report_section "Alerts" '^ADR Home =|^\*'

    adrci exec="set home $AdrHome; show incident -p \\\"create_time >= $Since and create_time < $Until\\\"" > "$TempFile"
    _adr_report_section "Incidents" '^ADR Home =|^\*|^0 rows fetched'

    adrci exec="set home $AdrHome; show problem -p \\\"lastinc_time >= $Since and lastinc_time < $Until\\\"" > "$TempFile"
    _adr_report_section "Problems" '^ADR Home =|^\*|^0 rows fetched'
} #...end of shell function "check_adr_home"...
#
#----------------------------------------------------------------------------
# Helper for check_adr_home: if the adrci output held in $TempFile still has
# lines once blank lines and lines matching the egrep pattern $2 have been
# discarded, append the whole file to $ReportFile under the heading
# "$1 for ADR Home $AdrHome" and set SendEmail=1...
#----------------------------------------------------------------------------
_adr_report_section() # $1 = section title, $2 = egrep pattern of lines to ignore
{
    LineCount=$(sed '/^$/d' "$TempFile" | egrep -v "$2" | wc -l)
    if (( $LineCount > 0 ))
    then
        {
            echo ""
            echo "$1 for ADR Home $AdrHome"
            echo "*************************************************************************"
            cat "$TempFile"
        } >> "$ReportFile"
        SendEmail=1
    fi
} #...end of shell function "_adr_report_section"...
#
#----------------------------------------------------------------------------
# Verify that an ORACLE_HOME has been specified on the UNIX command-line...
#----------------------------------------------------------------------------
if (( $# != 1 ))
then
    echo "Usage: $Pgm.sh ORACLE_HOME; aborting..."
    exit 1
fi
OraHome=$1
#
#----------------------------------------------------------------------------
# Verify that the specified ORACLE_HOME exists. The value is quoted so that
# an empty argument is tested as an (invalid) directory name instead of
# vanishing from the test, which would let the script carry on with an empty
# ORACLE_HOME...
#----------------------------------------------------------------------------
if [[ ! -d "$OraHome" ]]
then
    echo "Directory \"$OraHome\" not found; aborting..."
    exit 1
fi
#
#----------------------------------------------------------------------------
# Set the Oracle environment variables for this Oracle home...
#----------------------------------------------------------------------------
export ORACLE_HOME=$OraHome
export PATH=$ORACLE_HOME/bin:$PATH
#
#----------------------------------------------------------------------------
# Check that the adrci binary is present and executable. Every later step
# runs adrci, so stop here with exit status 1 if it cannot be used...
#----------------------------------------------------------------------------
if [ ! -x "$ORACLE_HOME/bin/adrci" ]
then
    echo "adrci binary does not exist or is not executable; aborting.."
    exit 1
fi
#
#----------------------------------------------------------------------------
# Locate the script's log file directory; if it doesn't exist, then create
# it...
#----------------------------------------------------------------------------
LogDir=/var/tmp/$Pgm
#
# Nothing to do when the directory is already there; otherwise try to create
# it (including missing parents) and give up with exit status 1 on failure...
#
[ -d "$LogDir" ] || mkdir -p "$LogDir" || {
    echo "Could not create Logging directory \"$LogDir\"; aborting..."
    exit 1
}
#
#----------------------------------------------------------------------------
# Locate the script's "log" file; if it doesn't exist, then initialize it.
# If it already exists but has grown too large (i.e. 100 lines or more), then
# trim it by re-initializing it...
#
# The last line of the "log" file contains "contextual" information for this
# script to use, namely the last time the script ran...
#----------------------------------------------------------------------------
Log=$LogDir/${Pgm}_state.log
if [ ! -r "$Log" ]
then
    #
    # No readable state file yet: create one whose last line is a date far
    # in the past, so that the first run uses 1970-01-01 as its start time...
    #
    {
        echo "# file initialized on \"$(date)\""
        echo "# PLEASE DO NOT edit this file"
        echo "1970-01-01 00:00:00"
    } > "$Log"
    chmod 640 "$Log"
else
    NbrLines=$(( $(wc -l < "$Log") ))
    if (( $NbrLines >= 100 ))
    then
        #
        # Too long: save the last line (the last run time) before the file
        # is truncated, then write a fresh header followed by that line.
        # NOTE(review): $Line is echoed unquoted, so runs of whitespace in
        # the saved line are collapsed.
        #
        Line=$(tail -1 "$Log")
        {
            echo "# file re-initialized on \"$(date)\""
            echo "# PLEASE DO NOT edit this file"
            echo $Line
        } > "$Log"
    fi
fi
#
#----------------------------------------------------------------------------
# Extract the last time the script ran from the last line of the "log"
# file...
#----------------------------------------------------------------------------
LastTime=$(tail -1 "$Log")
#
#----------------------------------------------------------------------------
# Make sure the date is in the expected format (YYYY-MM-DD HH:MM:SS), i.e.
# that nobody manually edited the "log" file. The value is matched exactly as
# it was read (no whitespace collapsing, no filename expansion), because that
# same string is what gets embedded in the adrci timestamp predicates...
#----------------------------------------------------------------------------
case $LastTime in
[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\ [0-9][0-9]:[0-9][0-9]:[0-9][0-9])
    ;;
*)
    echo "Last run date in \"log\" file (\"$LastTime\") not in expected format; aborting..."
    exit 1
    ;;
esac
#
#----------------------------------------------------------------------------
# Get the current time. This is what will be appended to the "log" file at
# the end of this script and therefore becomes the last time the script
# ran. This will be picked up on the next execution. In this way we maintain
# a moving window of data that ensures no adr events are ever missed...
#----------------------------------------------------------------------------
CurrTime=$(date +'%Y-%m-%d %H:%M:%S')
#
#----------------------------------------------------------------------------
# Keep only the 10 most recent lines of the "log" file. The tail is captured
# in a variable first, so the file is not truncated while it is being read...
#----------------------------------------------------------------------------
TrimLogFile=$(tail -10 "$Log")
echo "$TrimLogFile" > "$Log"
#
#----------------------------------------------------------------------------
# Name the two work files:
#   ReportFile - becomes the email body if any alerts, incidents, or
#                problems are found
#   TempFile   - receives the adrci output for one ADR home at a time; its
#                name embeds ORACLE_HOME with every "/" turned into "_"
#----------------------------------------------------------------------------
ReportFile=$LogDir/report.txt
TempFile=/tmp/${Pgm}$(echo $ORACLE_HOME | tr / _).out
#
#----------------------------------------------------------------------------
# Start a fresh "report" file containing just the title and an underline...
#----------------------------------------------------------------------------
{
    echo "Oracle Error Report For Period $LastTime - $CurrTime"
    echo "*************************************************************************"
} > "$ReportFile"
#
#----------------------------------------------------------------------------
# Gather all database and ASM homes in the ADR for this ORACLE_HOME into an
# array...
#----------------------------------------------------------------------------
#
# Listener homes are excluded from this pass: a listener home whose path
# merely contains "asm" or "rdbms" (for example .../tnslsnr/<host>/listener_asm)
# would otherwise be scanned here with the database filter and then again in
# the listener pass below, duplicating its incidents and problems...
#
AdrHomes=$(adrci exec="show homes" | egrep "rdbms|asm" | grep -v tnslsnr)
HomeType=rdbms_or_asm
#
#----------------------------------------------------------------------------
# Check each database/ASM ADR home for alerts, incidents, and problems.
# AdrHomes is a plain whitespace-separated list, so it is expanded unquoted
# on purpose to get one word per home...
#----------------------------------------------------------------------------
for AdrHome in $AdrHomes
do
    check_adr_home
done
#
#----------------------------------------------------------------------------
# Gather all listener (tnslsnr) homes in the ADR for this ORACLE_HOME into a
# list...
#----------------------------------------------------------------------------
AdrHomes=$(adrci exec="show homes" | grep tnslsnr)
HomeType=tnslsnr
#
#----------------------------------------------------------------------------
# Check each listener ADR home for alerts, incidents, and problems...
#----------------------------------------------------------------------------
for AdrHome in $AdrHomes
do
    check_adr_home
done
#
#----------------------------------------------------------------------------
# Log new starting values to the "log" file for the next time this script is
# executed...
#----------------------------------------------------------------------------
echo "$CurrTime" >> "$Log"
#
#----------------------------------------------------------------------------
# If any check raised the SendEmail flag, mail the whole report and arrange
# for exit status 2 ("alerts, incidents, or problems were found")...
#----------------------------------------------------------------------------
if [[ $SendEmail -eq 1 ]]
then
    ReturnCode=2
    ErrMsg=$(cat "$ReportFile")
    notify_via_email
fi
#
#----------------------------------------------------------------------------
# Discard the "temp" file and finish with the status chosen above...
#----------------------------------------------------------------------------
rm -f "$TempFile"
exit $ReturnCode