Skip to content
This repository was archived by the owner on Feb 13, 2025. It is now read-only.

Commit 90cc3a9

Browse files
authored
cmd/bosun: close on active results in delayed close
Documentation and fetching of the default delayed close period will come in a future commit. This commit also adds comments to existing notification functions and replaces some uses of "any" with TypeScript objects.
1 parent 5e92004 commit 90cc3a9

File tree

20 files changed

+5299
-4547
lines changed

20 files changed

+5299
-4547
lines changed

cmd/bosun/conf/notify.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ func init() {
2828
"The number of email notifications that Bosun failed to send.")
2929
}
3030

31+
// Notify triggers Email/HTTP/Print actions for the Notification object
3132
func (n *Notification) Notify(subject, body string, emailsubject, emailbody []byte, c SystemConfProvider, ak string, attachments ...*models.Attachment) {
3233
if len(n.Email) > 0 {
3334
go n.DoEmail(emailsubject, emailbody, c, ak, attachments...)

cmd/bosun/sched/check.go

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,7 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
122122
if err != nil {
123123
return
124124
}
125-
if incident != nil {
126-
rt, err = data.GetRenderedTemplates(incident.Id)
127-
if err != nil {
128-
return
129-
}
130-
}
125+
131126
defer func() {
132127
// save unless incident is new and closed (log alert)
133128
if incident != nil && (incident.Id != 0 || incident.Open) {
@@ -140,6 +135,70 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
140135
}
141136
}
142137
}()
138+
if incident != nil {
139+
rt, err = data.GetRenderedTemplates(incident.Id)
140+
if err != nil {
141+
return
142+
}
143+
for i, action := range incident.Actions {
144+
if action.Type == models.ActionDelayedClose && !(action.Fullfilled || action.Cancelled) {
145+
if event.Status > incident.WorstStatus {
146+
// If the lifetime severity of the incident has increased, cancel the delayed close
147+
err = s.ActionByAlertKey("bosun", "cancelled delayed close due to severity increase", models.ActionCancelClose, nil, ak)
148+
if err != nil {
149+
return
150+
}
151+
incident, err = data.GetIncidentState(incident.Id)
152+
if err != nil {
153+
return
154+
}
155+
// Continue processing alert after cancelling the delayed close
156+
break
157+
}
158+
if action.Deadline == nil {
159+
err = fmt.Errorf("should not be here - cancelled close without deadline")
160+
return
161+
}
162+
if r.Start.Before(*action.Deadline) {
163+
if event.Status == models.StNormal {
164+
slog.Infof("closing alert %v on delayed close because the alert has returned to normal before deadline", incident.AlertKey)
165+
if event.Status != incident.CurrentStatus {
166+
incident.Events = append(incident.Events, *event)
167+
}
168+
incident.CurrentStatus = event.Status
169+
// Action needs to know it is normal, so update the incident that action will read
170+
_, err = data.UpdateIncidentState(incident)
171+
if err != nil {
172+
return
173+
}
174+
err = s.ActionByAlertKey("bosun", fmt.Sprintf("close on behalf of delayed close by %v", action.User), models.ActionClose, nil, ak)
175+
if err != nil {
176+
return
177+
}
178+
incident, err = data.GetIncidentState(incident.Id)
179+
if err != nil {
180+
return
181+
}
182+
incident.Actions[i].Fullfilled = true
183+
return
184+
}
185+
} else {
186+
// We are after Deadline
187+
slog.Infof("force closing alert %v on delayed close because the alert is after the deadline", incident.AlertKey)
188+
incident.Actions[i].Fullfilled = true
189+
err = s.ActionByAlertKey("bosun", fmt.Sprintf("forceclose on behalf of delayed close by %v", action.User), models.ActionForceClose, nil, ak)
190+
if err != nil {
191+
return
192+
}
193+
incident, err = data.GetIncidentState(incident.Id)
194+
if err != nil {
195+
return
196+
}
197+
return
198+
}
199+
}
200+
}
201+
}
143202
// If nothing is out of the ordinary we are done
144203
if event.Status <= models.StNormal && incident == nil {
145204
return
@@ -248,7 +307,7 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
248307
if si := silenced(ak); si != nil && event.Status == models.StNormal {
249308
go func(ak models.AlertKey) {
250309
slog.Infof("auto close %s because was silenced", ak)
251-
err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, ak)
310+
err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, nil, ak)
252311
if err != nil {
253312
slog.Errorln(err)
254313
}
@@ -267,6 +326,7 @@ func silencedOrIgnored(a *conf.Alert, event *models.Event, si *models.Silence) b
267326
}
268327
return false
269328
}
329+
270330
func (s *Schedule) executeTemplates(state *models.IncidentState, rt *models.RenderedTemplates, event *models.Event, a *conf.Alert, r *RunHistory) {
271331
if event.Status != models.StUnknown {
272332
var errs []error

cmd/bosun/sched/check_test.go

Lines changed: 151 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func TestCheckFlapping(t *testing.T) {
8585
r.Events[ak].Status = models.StNormal
8686
s.RunHistory(r)
8787
// Close the alert, so it should notify next time.
88-
if err := s.ActionByAlertKey("", "", models.ActionClose, ak); err != nil {
88+
if err := s.ActionByAlertKey("", "", models.ActionClose, nil, ak); err != nil {
8989
t.Fatal(err)
9090
}
9191
r.Events[ak].Status = models.StWarning
@@ -141,6 +141,155 @@ func TestCheckSilence(t *testing.T) {
141141
}
142142
}
143143

144+
func TestDelayedClose(t *testing.T) {
145+
defer setup()()
146+
c, err := rule.NewConf("", conf.EnabledBackends{}, nil, `
147+
alert a {
148+
warn = 1
149+
crit = 1
150+
warnNotification = test
151+
critNotification = test
152+
template = test
153+
}
154+
template test {
155+
subject = test
156+
}
157+
notification test {
158+
print = true
159+
}
160+
`)
161+
if err != nil {
162+
t.Fatal(err)
163+
}
164+
s, _ := initSched(&conf.SystemConf{}, c)
165+
now := time.Now()
166+
ak := models.NewAlertKey("a", nil)
167+
r := &RunHistory{
168+
Start: now,
169+
Events: map[models.AlertKey]*models.Event{
170+
ak: {Status: models.StWarning},
171+
},
172+
}
173+
expect := func(id int64, active bool, open bool) {
174+
incident, err := s.DataAccess.State().GetLatestIncident(ak)
175+
if err != nil {
176+
t.Fatal(err)
177+
}
178+
if incident.Id != id {
179+
t.Fatalf("expected incident id %d. Got %d.", id, incident.Id)
180+
}
181+
if incident.IsActive() != active {
182+
t.Fatalf("expected incident active status to be %v but got %v", active, incident.IsActive())
183+
}
184+
if incident.Open != open {
185+
t.Fatalf("expected incident closed boolean to be %v but got %v", open, incident.Open)
186+
}
187+
}
188+
expectPendingNotifications := func(i int) {
189+
if len(s.pendingNotifications[s.RuleConf.GetNotification("test")]) != i {
190+
t.Fatalf("expencted %v pending notifications but got %v", i, len(s.pendingNotifications[s.RuleConf.GetNotification("test")]))
191+
}
192+
s.pendingNotifications = nil
193+
}
194+
advance := func(i int64) {
195+
r.Start = r.Start.Add(time.Second * time.Duration(i))
196+
}
197+
s.RunHistory(r)
198+
expect(1, true, true)
199+
expectPendingNotifications(1)
200+
s.pendingNotifications = nil
201+
202+
// Test case where close issue and alert goes to normal before deadline
203+
fiveMin := r.Start.Add(time.Minute * 5)
204+
err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
205+
if err != nil {
206+
t.Fatal(err)
207+
}
208+
advance(1)
209+
s.RunHistory(r)
210+
expect(1, true, true)
211+
212+
r.Events[ak].Status = models.StNormal
213+
advance(1)
214+
s.RunHistory(r)
215+
expect(1, false, false)
216+
217+
r.Events[ak].Status = models.StWarning
218+
advance(1)
219+
s.RunHistory(r)
220+
expect(2, true, true)
221+
expectPendingNotifications(1)
222+
223+
// Test case where close issue and alert does not go normal before deadline
224+
// which should result in a force closing
225+
fiveMin = r.Start.Add(time.Minute * 5)
226+
err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
227+
if err != nil {
228+
t.Fatal(err)
229+
}
230+
231+
advance(301)
232+
s.RunHistory(r)
233+
expect(2, true, false)
234+
235+
r.Events[ak].Status = models.StWarning
236+
advance(1)
237+
s.RunHistory(r)
238+
expect(3, true, true)
239+
expectPendingNotifications(1)
240+
241+
// Test cancelling a delayed close
242+
fiveMin = r.Start.Add(time.Minute * 5)
243+
err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
244+
if err != nil {
245+
t.Fatal(err)
246+
}
247+
advance(1)
248+
s.RunHistory(r)
249+
expect(3, true, true)
250+
251+
err = s.ActionByAlertKey("", "", models.ActionCancelClose, nil, ak)
252+
if err != nil {
253+
t.Fatal(err)
254+
}
255+
advance(300)
256+
s.RunHistory(r)
257+
expect(3, true, true)
258+
259+
// Make sure delayed close works after a previous delayed close was cancelled
260+
fiveMin = r.Start.Add(time.Minute * 5)
261+
err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
262+
if err != nil {
263+
t.Fatal(err)
264+
}
265+
advance(301)
266+
s.RunHistory(r)
267+
expect(3, true, false)
268+
269+
r.Events[ak].Status = models.StWarning
270+
advance(1)
271+
s.RunHistory(r)
272+
expect(4, true, true)
273+
expectPendingNotifications(1)
274+
275+
// Make sure escalation cancels a delayed close
276+
fiveMin = r.Start.Add(time.Minute * 5)
277+
err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
278+
if err != nil {
279+
t.Fatal(err)
280+
}
281+
r.Events[ak].Status = models.StCritical
282+
advance(1)
283+
s.RunHistory(r)
284+
expect(4, true, true)
285+
expectPendingNotifications(1)
286+
287+
advance(300)
288+
s.RunHistory(r)
289+
expect(4, true, true)
290+
expectPendingNotifications(0)
291+
}
292+
144293
func TestIncidentIds(t *testing.T) {
145294
defer setup()()
146295
c, err := rule.NewConf("", conf.EnabledBackends{}, nil, `
@@ -180,7 +329,7 @@ func TestIncidentIds(t *testing.T) {
180329

181330
r.Events[ak].Status = models.StNormal
182331
s.RunHistory(r)
183-
err = s.ActionByAlertKey("", "", models.ActionClose, ak)
332+
err = s.ActionByAlertKey("", "", models.ActionClose, nil, ak)
184333
if err != nil {
185334
t.Fatal(err)
186335
}

cmd/bosun/sched/notify.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ import (
1313
"bosun.org/slog"
1414
)
1515

16+
// dispatchNotifications triggers notification checks at 2x the system configuration's
17+
// check frequency, when something has signaled the schedule via the nc channels, or when
18+
// a notification that was scheduled in the future due to a notification chain is ready to be sent.
1619
func (s *Schedule) dispatchNotifications() {
1720
ticker := time.NewTicker(s.SystemConf.GetCheckFrequency() * 2)
1821
var next <-chan time.Time
@@ -42,6 +45,7 @@ type IncidentWithTemplates struct {
4245
*models.RenderedTemplates
4346
}
4447

48+
// Notify puts a rendered notification in the schedule's pendingNotifications queue
4549
func (s *Schedule) Notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) {
4650
it := &IncidentWithTemplates{}
4751
it.IncidentState = st
@@ -119,6 +123,10 @@ func (s *Schedule) CheckNotifications() time.Time {
119123
return timeout
120124
}
121125

126+
// sendNotifications processes the schedule's pendingNotifications queue. It silences notifications,
127+
// moves unknown notifications to the unknownNotifications queue so they can be grouped, calls the notification
128+
// Notify method to trigger notification actions, and queues notifications that are in the future because they
129+
// are part of a notification chain
122130
func (s *Schedule) sendNotifications(silenced SilenceTester) {
123131
if s.quiet {
124132
slog.Infoln("quiet mode prevented", len(s.pendingNotifications), "notifications")
@@ -157,6 +165,9 @@ func (s *Schedule) sendNotifications(silenced SilenceTester) {
157165
}
158166
}
159167

168+
// sendUnknownNotifications processes the schedule's pendingUnknowns queue. It puts unknowns into groups
169+
// to be processed by the schedule's unotify method. When it is done processing the pendingUnknowns queue
170+
// it reinitializes the queue.
160171
func (s *Schedule) sendUnknownNotifications() {
161172
slog.Info("Batching and sending unknown notifications")
162173
defer slog.Info("Done sending unknown notifications")
@@ -207,6 +218,8 @@ var unknownMultiGroup = ttemplate.Must(ttemplate.New("unknownMultiGroup").Parse(
207218
</ul>
208219
`))
209220

221+
// notify is a wrapper for the notification's Notify method that sets the EmailSubject and EmailBody for the rendered
222+
// template. It passes properties from the schedule that the Notification's Notify method requires.
210223
func (s *Schedule) notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) {
211224
if len(rt.EmailSubject) == 0 {
212225
rt.EmailSubject = []byte(st.Subject)
@@ -252,6 +265,8 @@ var defaultUnknownTemplate = &conf.Template{
252265
Subject: ttemplate.Must(ttemplate.New("").Parse(`{{.Name}}: {{.Group | len}} unknown alerts`)),
253266
}
254267

268+
// unotify builds an unknown notification for an alertkey or a group of alert keys. It renders the template
269+
// and calls the notification's Notify method to trigger the action.
255270
func (s *Schedule) unotify(name string, group models.AlertKeys, n *conf.Notification) {
256271
subject := new(bytes.Buffer)
257272
body := new(bytes.Buffer)
@@ -275,6 +290,8 @@ func (s *Schedule) unotify(name string, group models.AlertKeys, n *conf.Notifica
275290
n.Notify(subject.String(), body.String(), subject.Bytes(), body.Bytes(), s.SystemConf, name)
276291
}
277292

293+
// QueueNotification persists a notification to the datastore to be sent in the future. This happens when
294+
// there are notification chains or an alert is unevaluated due to a dependency.
278295
func (s *Schedule) QueueNotification(ak models.AlertKey, n *conf.Notification, started time.Time) error {
279296
return s.DataAccess.Notifications().InsertNotification(ak, n.Name, started.Add(n.Timeout))
280297
}

0 commit comments

Comments
 (0)