Skip to content

Commit

Permalink
cmd/bosun: close on active results in delayed close
Browse files Browse the repository at this point in the history
Documentation and fetching the default delayed close period to come in future commit. 

This also adds comments to existing notification functions and replaces some uses of "any" with TypeScript objects.
  • Loading branch information
kylebrandt committed May 5, 2017
1 parent 5e92004 commit 90cc3a9
Show file tree
Hide file tree
Showing 20 changed files with 5,299 additions and 4,547 deletions.
1 change: 1 addition & 0 deletions cmd/bosun/conf/notify.go
Expand Up @@ -28,6 +28,7 @@ func init() {
"The number of email notifications that Bosun failed to send.")
}

// Notify triggers Email/HTTP/Print actions for the Notification object
func (n *Notification) Notify(subject, body string, emailsubject, emailbody []byte, c SystemConfProvider, ak string, attachments ...*models.Attachment) {
if len(n.Email) > 0 {
go n.DoEmail(emailsubject, emailbody, c, ak, attachments...)
Expand Down
74 changes: 67 additions & 7 deletions cmd/bosun/sched/check.go
Expand Up @@ -122,12 +122,7 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
if err != nil {
return
}
if incident != nil {
rt, err = data.GetRenderedTemplates(incident.Id)
if err != nil {
return
}
}

defer func() {
// save unless incident is new and closed (log alert)
if incident != nil && (incident.Id != 0 || incident.Open) {
Expand All @@ -140,6 +135,70 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
}
}
}()
if incident != nil {
rt, err = data.GetRenderedTemplates(incident.Id)
if err != nil {
return
}
for i, action := range incident.Actions {
if action.Type == models.ActionDelayedClose && !(action.Fullfilled || action.Cancelled) {
if event.Status > incident.WorstStatus {
// If the lifetime severity of the incident has increased, cancel the delayed close
err = s.ActionByAlertKey("bosun", "cancelled delayed close due to severity increase", models.ActionCancelClose, nil, ak)
if err != nil {
return
}
incident, err = data.GetIncidentState(incident.Id)
if err != nil {
return
}
// Continue processing alert after cancelling the delayed close
break
}
if action.Deadline == nil {
err = fmt.Errorf("should not be here - cancelled close without deadline")
return
}
if r.Start.Before(*action.Deadline) {
if event.Status == models.StNormal {
slog.Infof("closing alert %v on delayed close because the alert has returned to normal before deadline", incident.AlertKey)
if event.Status != incident.CurrentStatus {
incident.Events = append(incident.Events, *event)
}
incident.CurrentStatus = event.Status
// Action needs to know it is normal, so update the incident that action will read
_, err = data.UpdateIncidentState(incident)
if err != nil {
return
}
err = s.ActionByAlertKey("bosun", fmt.Sprintf("close on behalf of delayed close by %v", action.User), models.ActionClose, nil, ak)
if err != nil {
return
}
incident, err = data.GetIncidentState(incident.Id)
if err != nil {
return
}
incident.Actions[i].Fullfilled = true
return
}
} else {
// We are after Deadline
slog.Infof("force closing alert %v on delayed close because the alert is after the deadline", incident.AlertKey)
incident.Actions[i].Fullfilled = true
err = s.ActionByAlertKey("bosun", fmt.Sprintf("forceclose on behalf of delayed close by %v", action.User), models.ActionForceClose, nil, ak)
if err != nil {
return
}
incident, err = data.GetIncidentState(incident.Id)
if err != nil {
return
}
return
}
}
}
}
// If nothing is out of the ordinary we are done
if event.Status <= models.StNormal && incident == nil {
return
Expand Down Expand Up @@ -248,7 +307,7 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
if si := silenced(ak); si != nil && event.Status == models.StNormal {
go func(ak models.AlertKey) {
slog.Infof("auto close %s because was silenced", ak)
err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, ak)
err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, nil, ak)
if err != nil {
slog.Errorln(err)
}
Expand All @@ -267,6 +326,7 @@ func silencedOrIgnored(a *conf.Alert, event *models.Event, si *models.Silence) b
}
return false
}

func (s *Schedule) executeTemplates(state *models.IncidentState, rt *models.RenderedTemplates, event *models.Event, a *conf.Alert, r *RunHistory) {
if event.Status != models.StUnknown {
var errs []error
Expand Down
153 changes: 151 additions & 2 deletions cmd/bosun/sched/check_test.go
Expand Up @@ -85,7 +85,7 @@ func TestCheckFlapping(t *testing.T) {
r.Events[ak].Status = models.StNormal
s.RunHistory(r)
// Close the alert, so it should notify next time.
if err := s.ActionByAlertKey("", "", models.ActionClose, ak); err != nil {
if err := s.ActionByAlertKey("", "", models.ActionClose, nil, ak); err != nil {
t.Fatal(err)
}
r.Events[ak].Status = models.StWarning
Expand Down Expand Up @@ -141,6 +141,155 @@ func TestCheckSilence(t *testing.T) {
}
}

// TestDelayedClose exercises the delayed close action (a close action submitted with a
// deadline) against a single alert key. It walks through these scenarios in order:
//
//   - the alert returns to normal before the deadline, so the incident is closed;
//   - the alert is still abnormal after the deadline, so the incident is force closed;
//   - the delayed close is cancelled explicitly, so the incident stays open;
//   - a new delayed close still works after an earlier one was cancelled;
//   - the alert escalates to a worse status, which cancels the delayed close.
//
// The scenarios share one schedule and one RunHistory, so each step depends on the
// state left behind by the previous ones.
func TestDelayedClose(t *testing.T) {
	defer setup()()
	c, err := rule.NewConf("", conf.EnabledBackends{}, nil, `
alert a {
warn = 1
crit = 1
warnNotification = test
critNotification = test
template = test
}
template test {
subject = test
}
notification test {
print = true
}
`)
	if err != nil {
		t.Fatal(err)
	}
	s, _ := initSched(&conf.SystemConf{}, c)
	now := time.Now()
	ak := models.NewAlertKey("a", nil)
	r := &RunHistory{
		Start: now,
		Events: map[models.AlertKey]*models.Event{
			ak: {Status: models.StWarning},
		},
	}
	// expect asserts the id, active state and open state of the latest incident for ak.
	expect := func(id int64, active bool, open bool) {
		incident, err := s.DataAccess.State().GetLatestIncident(ak)
		if err != nil {
			t.Fatal(err)
		}
		if incident.Id != id {
			t.Fatalf("expected incident id %d. Got %d.", id, incident.Id)
		}
		if incident.IsActive() != active {
			t.Fatalf("expected incident active status to be %v but got %v", active, incident.IsActive())
		}
		if incident.Open != open {
			t.Fatalf("expected incident open boolean to be %v but got %v", open, incident.Open)
		}
	}
	// expectPendingNotifications asserts how many notifications are queued for the
	// "test" notification and then clears the whole queue, so that the next assertion
	// only counts notifications queued after this call.
	expectPendingNotifications := func(i int) {
		pending := len(s.pendingNotifications[s.RuleConf.GetNotification("test")])
		if pending != i {
			t.Fatalf("expected %v pending notifications but got %v", i, pending)
		}
		s.pendingNotifications = nil
	}
	// advance moves the start time of the run history forward by i seconds.
	advance := func(i int64) {
		r.Start = r.Start.Add(time.Second * time.Duration(i))
	}
	s.RunHistory(r)
	expect(1, true, true)
	expectPendingNotifications(1)

	// Test case where close issue and alert goes to normal before deadline
	fiveMin := r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	advance(1)
	s.RunHistory(r)
	expect(1, true, true)

	r.Events[ak].Status = models.StNormal
	advance(1)
	s.RunHistory(r)
	expect(1, false, false)

	r.Events[ak].Status = models.StWarning
	advance(1)
	s.RunHistory(r)
	expect(2, true, true)
	expectPendingNotifications(1)

	// Test case where close issue and alert does not go normal before deadline
	// which should result in a force closing
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}

	// 301 seconds puts the run one second past the five minute deadline.
	advance(301)
	s.RunHistory(r)
	expect(2, true, false)

	r.Events[ak].Status = models.StWarning
	advance(1)
	s.RunHistory(r)
	expect(3, true, true)
	expectPendingNotifications(1)

	// Test cancelling a delayed close
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	advance(1)
	s.RunHistory(r)
	expect(3, true, true)

	err = s.ActionByAlertKey("", "", models.ActionCancelClose, nil, ak)
	if err != nil {
		t.Fatal(err)
	}
	// Go past the original deadline; the incident must stay open because the close was cancelled.
	advance(300)
	s.RunHistory(r)
	expect(3, true, true)

	// Make sure delayed close works after a previous delayed close was cancelled
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	advance(301)
	s.RunHistory(r)
	expect(3, true, false)

	r.Events[ak].Status = models.StWarning
	advance(1)
	s.RunHistory(r)
	expect(4, true, true)
	expectPendingNotifications(1)

	// Make sure escalation cancels a delayed close
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	r.Events[ak].Status = models.StCritical
	advance(1)
	s.RunHistory(r)
	expect(4, true, true)
	expectPendingNotifications(1)

	// Past the original deadline the incident must still be open, with nothing new queued.
	advance(300)
	s.RunHistory(r)
	expect(4, true, true)
	expectPendingNotifications(0)
}

func TestIncidentIds(t *testing.T) {
defer setup()()
c, err := rule.NewConf("", conf.EnabledBackends{}, nil, `
Expand Down Expand Up @@ -180,7 +329,7 @@ func TestIncidentIds(t *testing.T) {

r.Events[ak].Status = models.StNormal
s.RunHistory(r)
err = s.ActionByAlertKey("", "", models.ActionClose, ak)
err = s.ActionByAlertKey("", "", models.ActionClose, nil, ak)
if err != nil {
t.Fatal(err)
}
Expand Down
17 changes: 17 additions & 0 deletions cmd/bosun/sched/notify.go
Expand Up @@ -13,6 +13,9 @@ import (
"bosun.org/slog"
)

// dispatchNotifications triggers notification checks at 2x the system configuration's
// check frequency, when something has signaled the schedule via the nc channels, or when
// a notification that was scheduled in the future due to a notification chain comes due
func (s *Schedule) dispatchNotifications() {
ticker := time.NewTicker(s.SystemConf.GetCheckFrequency() * 2)
var next <-chan time.Time
Expand Down Expand Up @@ -42,6 +45,7 @@ type IncidentWithTemplates struct {
*models.RenderedTemplates
}

// Notify puts a rendered notification in the schedule's pendingNotifications queue
func (s *Schedule) Notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) {
it := &IncidentWithTemplates{}
it.IncidentState = st
Expand Down Expand Up @@ -119,6 +123,10 @@ func (s *Schedule) CheckNotifications() time.Time {
return timeout
}

// sendNotifications processes the schedule's pendingNotifications queue. It silences notifications,
// moves unknown notifications to the unknownNotifications queue so they can be grouped, calls the notification
// Notify method to trigger notification actions, and queues notifications that are in the future because they
// are part of a notification chain
func (s *Schedule) sendNotifications(silenced SilenceTester) {
if s.quiet {
slog.Infoln("quiet mode prevented", len(s.pendingNotifications), "notifications")
Expand Down Expand Up @@ -157,6 +165,9 @@ func (s *Schedule) sendNotifications(silenced SilenceTester) {
}
}

// sendUnknownNotifications processes the schedule's pendingUnknowns queue. It puts unknowns into groups
// to be processed by the schedule's unotify method. When it is done processing the pendingUnknowns queue
// it reinitializes the queue.
func (s *Schedule) sendUnknownNotifications() {
slog.Info("Batching and sending unknown notifications")
defer slog.Info("Done sending unknown notifications")
Expand Down Expand Up @@ -207,6 +218,8 @@ var unknownMultiGroup = ttemplate.Must(ttemplate.New("unknownMultiGroup").Parse(
</ul>
`))

// notify is a wrapper for the notifications Notify method that sets the EmailSubject and EmailBody for the rendered
// template. It passes properties from the schedule that the Notification's Notify method requires.
func (s *Schedule) notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) {
if len(rt.EmailSubject) == 0 {
rt.EmailSubject = []byte(st.Subject)
Expand Down Expand Up @@ -252,6 +265,8 @@ var defaultUnknownTemplate = &conf.Template{
Subject: ttemplate.Must(ttemplate.New("").Parse(`{{.Name}}: {{.Group | len}} unknown alerts`)),
}

// unotify builds an unknown notification for an alertkey or a group of alert keys. It renders the template
// and calls the notification's Notify method to trigger the action.
func (s *Schedule) unotify(name string, group models.AlertKeys, n *conf.Notification) {
subject := new(bytes.Buffer)
body := new(bytes.Buffer)
Expand All @@ -275,6 +290,8 @@ func (s *Schedule) unotify(name string, group models.AlertKeys, n *conf.Notifica
n.Notify(subject.String(), body.String(), subject.Bytes(), body.Bytes(), s.SystemConf, name)
}

// QueueNotification stores notification n for alert key ak in the datastore so it can be
// sent later; the time recorded is started plus the notification's Timeout. Notifications
// are queued this way for notification chains or when an alert is unevaluated due to a
// dependency. It returns any error reported by the datastore insert.
func (s *Schedule) QueueNotification(ak models.AlertKey, n *conf.Notification, started time.Time) error {
	store := s.DataAccess.Notifications()
	sendAt := started.Add(n.Timeout)
	return store.InsertNotification(ak, n.Name, sendAt)
}
Expand Down

0 comments on commit 90cc3a9

Please sign in to comment.