Skip to content
Permalink
Browse files
cmd/bosun: close on active results in delayed close
Documentation and fetching the default delayed close period to come in a future commit.

This also includes comments on existing notification functions and some TypeScript objects in place of "any".
  • Loading branch information
kylebrandt committed May 5, 2017
1 parent 5e92004 commit 90cc3a9
Show file tree
Hide file tree
Showing 20 changed files with 5,299 additions and 4,547 deletions.
@@ -28,6 +28,7 @@ func init() {
"The number of email notifications that Bosun failed to send.")
}

// Notify triggers Email/HTTP/Print actions for the Notification object
func (n *Notification) Notify(subject, body string, emailsubject, emailbody []byte, c SystemConfProvider, ak string, attachments ...*models.Attachment) {
if len(n.Email) > 0 {
go n.DoEmail(emailsubject, emailbody, c, ak, attachments...)
@@ -122,12 +122,7 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
if err != nil {
return
}
if incident != nil {
rt, err = data.GetRenderedTemplates(incident.Id)
if err != nil {
return
}
}

defer func() {
// save unless incident is new and closed (log alert)
if incident != nil && (incident.Id != 0 || incident.Open) {
@@ -140,6 +135,70 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
}
}
}()
if incident != nil {
rt, err = data.GetRenderedTemplates(incident.Id)
if err != nil {
return
}
for i, action := range incident.Actions {
if action.Type == models.ActionDelayedClose && !(action.Fullfilled || action.Cancelled) {
if event.Status > incident.WorstStatus {
// If the lifetime severity of the incident has increased, cancel the delayed close
err = s.ActionByAlertKey("bosun", "cancelled delayed close due to severity increase", models.ActionCancelClose, nil, ak)
if err != nil {
return
}
incident, err = data.GetIncidentState(incident.Id)
if err != nil {
return
}
// Continue processing alert after cancelling the delayed close
break
}
if action.Deadline == nil {
err = fmt.Errorf("should not be here - cancelled close without deadline")
return
}
if r.Start.Before(*action.Deadline) {
if event.Status == models.StNormal {
slog.Infof("closing alert %v on delayed close because the alert has returned to normal before deadline", incident.AlertKey)
if event.Status != incident.CurrentStatus {
incident.Events = append(incident.Events, *event)
}
incident.CurrentStatus = event.Status
// Action needs to know it is normal, so update the incident that action will read
_, err = data.UpdateIncidentState(incident)
if err != nil {
return
}
err = s.ActionByAlertKey("bosun", fmt.Sprintf("close on behalf of delayed close by %v", action.User), models.ActionClose, nil, ak)
if err != nil {
return
}
incident, err = data.GetIncidentState(incident.Id)
if err != nil {
return
}
incident.Actions[i].Fullfilled = true
return
}
} else {
// We are after Deadline
slog.Infof("force closing alert %v on delayed close because the alert is after the deadline", incident.AlertKey)
incident.Actions[i].Fullfilled = true
err = s.ActionByAlertKey("bosun", fmt.Sprintf("forceclose on behalf of delayed close by %v", action.User), models.ActionForceClose, nil, ak)
if err != nil {
return
}
incident, err = data.GetIncidentState(incident.Id)
if err != nil {
return
}
return
}
}
}
}
// If nothing is out of the ordinary we are done
if event.Status <= models.StNormal && incident == nil {
return
@@ -248,7 +307,7 @@ func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.E
if si := silenced(ak); si != nil && event.Status == models.StNormal {
go func(ak models.AlertKey) {
slog.Infof("auto close %s because was silenced", ak)
err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, ak)
err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, nil, ak)
if err != nil {
slog.Errorln(err)
}
@@ -267,6 +326,7 @@ func silencedOrIgnored(a *conf.Alert, event *models.Event, si *models.Silence) b
}
return false
}

func (s *Schedule) executeTemplates(state *models.IncidentState, rt *models.RenderedTemplates, event *models.Event, a *conf.Alert, r *RunHistory) {
if event.Status != models.StUnknown {
var errs []error
@@ -85,7 +85,7 @@ func TestCheckFlapping(t *testing.T) {
r.Events[ak].Status = models.StNormal
s.RunHistory(r)
// Close the alert, so it should notify next time.
if err := s.ActionByAlertKey("", "", models.ActionClose, ak); err != nil {
if err := s.ActionByAlertKey("", "", models.ActionClose, nil, ak); err != nil {
t.Fatal(err)
}
r.Events[ak].Status = models.StWarning
@@ -141,6 +141,155 @@ func TestCheckSilence(t *testing.T) {
}
}

// TestDelayedClose drives a single alert key through the delayed close scenarios:
// closing once the alert returns to normal before the deadline, force closing once
// the deadline has passed, explicitly cancelling a delayed close, issuing a new
// delayed close after a cancelled one, and cancelling on a severity increase.
// After each run it checks the latest incident's id, active state and open state.
func TestDelayedClose(t *testing.T) {
	defer setup()()
	c, err := rule.NewConf("", conf.EnabledBackends{}, nil, `
alert a {
warn = 1
crit = 1
warnNotification = test
critNotification = test
template = test
}
template test {
subject = test
}
notification test {
print = true
}
`)
	if err != nil {
		t.Fatal(err)
	}
	s, _ := initSched(&conf.SystemConf{}, c)
	now := time.Now()
	ak := models.NewAlertKey("a", nil)
	r := &RunHistory{
		Start: now,
		Events: map[models.AlertKey]*models.Event{
			ak: {Status: models.StWarning},
		},
	}
	// expect fails the test unless the latest incident for ak has the given id,
	// active state and open state.
	expect := func(id int64, active bool, open bool) {
		incident, err := s.DataAccess.State().GetLatestIncident(ak)
		if err != nil {
			t.Fatal(err)
		}
		if incident.Id != id {
			t.Fatalf("expected incident id %d. Got %d.", id, incident.Id)
		}
		if incident.IsActive() != active {
			t.Fatalf("expected incident active status to be %v but got %v", active, incident.IsActive())
		}
		if incident.Open != open {
			t.Fatalf("expected incident open boolean to be %v but got %v", open, incident.Open)
		}
	}
	// expectPendingNotifications fails the test unless exactly i notifications are
	// pending for the "test" notification, then resets the pending queue so the
	// next check starts from empty.
	expectPendingNotifications := func(i int) {
		if len(s.pendingNotifications[s.RuleConf.GetNotification("test")]) != i {
			t.Fatalf("expected %v pending notifications but got %v", i, len(s.pendingNotifications[s.RuleConf.GetNotification("test")]))
		}
		s.pendingNotifications = nil
	}
	// advance moves the run history's start time forward by i seconds.
	advance := func(i int64) {
		r.Start = r.Start.Add(time.Second * time.Duration(i))
	}
	s.RunHistory(r)
	expect(1, true, true)
	expectPendingNotifications(1)
	s.pendingNotifications = nil

	// Test case where close issue and alert goes to normal before deadline
	fiveMin := r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	// Still warning and before the deadline: the incident stays open.
	advance(1)
	s.RunHistory(r)
	expect(1, true, true)

	// Normal before the deadline: the incident is closed.
	r.Events[ak].Status = models.StNormal
	advance(1)
	s.RunHistory(r)
	expect(1, false, false)

	// Going abnormal again opens a new incident and notifies.
	r.Events[ak].Status = models.StWarning
	advance(1)
	s.RunHistory(r)
	expect(2, true, true)
	expectPendingNotifications(1)

	// Test case where close issue and alert does not go normal before deadline
	// which should result in a force closing
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}

	// 301 seconds puts the run one second past the five minute deadline.
	advance(301)
	s.RunHistory(r)
	expect(2, true, false)

	r.Events[ak].Status = models.StWarning
	advance(1)
	s.RunHistory(r)
	expect(3, true, true)
	expectPendingNotifications(1)

	// Test cancelling a delayed close
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	advance(1)
	s.RunHistory(r)
	expect(3, true, true)

	err = s.ActionByAlertKey("", "", models.ActionCancelClose, nil, ak)
	if err != nil {
		t.Fatal(err)
	}
	// Past the original deadline, the cancelled delayed close must not close the incident.
	advance(300)
	s.RunHistory(r)
	expect(3, true, true)

	// Make sure delayed close works after a previous delayed close was cancelled
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	advance(301)
	s.RunHistory(r)
	expect(3, true, false)

	r.Events[ak].Status = models.StWarning
	advance(1)
	s.RunHistory(r)
	expect(4, true, true)
	expectPendingNotifications(1)

	// Make sure escalation cancels a delayed close
	fiveMin = r.Start.Add(time.Minute * 5)
	err = s.ActionByAlertKey("", "", models.ActionClose, &fiveMin, ak)
	if err != nil {
		t.Fatal(err)
	}
	r.Events[ak].Status = models.StCritical
	advance(1)
	s.RunHistory(r)
	expect(4, true, true)
	expectPendingNotifications(1)

	// Past the deadline of the cancelled delayed close: the incident stays open
	// and nothing new is queued.
	advance(300)
	s.RunHistory(r)
	expect(4, true, true)
	expectPendingNotifications(0)
}

func TestIncidentIds(t *testing.T) {
defer setup()()
c, err := rule.NewConf("", conf.EnabledBackends{}, nil, `
@@ -180,7 +329,7 @@ func TestIncidentIds(t *testing.T) {

r.Events[ak].Status = models.StNormal
s.RunHistory(r)
err = s.ActionByAlertKey("", "", models.ActionClose, ak)
err = s.ActionByAlertKey("", "", models.ActionClose, nil, ak)
if err != nil {
t.Fatal(err)
}
@@ -13,6 +13,9 @@ import (
"bosun.org/slog"
)

// dispatchNotifications triggers notification checks at 2x the system configuration's
// check frequency, when something has signaled the schedule via the nc channels, or when
// a notification that was scheduled in the future due to a notification chain becomes due
func (s *Schedule) dispatchNotifications() {
ticker := time.NewTicker(s.SystemConf.GetCheckFrequency() * 2)
var next <-chan time.Time
@@ -42,6 +45,7 @@ type IncidentWithTemplates struct {
*models.RenderedTemplates
}

// Notify puts a rendered notification in the schedule's pendingNotifications queue
func (s *Schedule) Notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) {
it := &IncidentWithTemplates{}
it.IncidentState = st
@@ -119,6 +123,10 @@ func (s *Schedule) CheckNotifications() time.Time {
return timeout
}

// sendNotifications processes the schedule's pendingNotifications queue. It silences notifications,
// moves unknown notifications to the unknownNotifications queue so they can be grouped, calls the notification
// Notify method to trigger notification actions, and queues notifications that are in the future because they
// are part of a notification chain
func (s *Schedule) sendNotifications(silenced SilenceTester) {
if s.quiet {
slog.Infoln("quiet mode prevented", len(s.pendingNotifications), "notifications")
@@ -157,6 +165,9 @@ func (s *Schedule) sendNotifications(silenced SilenceTester) {
}
}

// sendUnknownNotifications processes the schedule's pendingUnknowns queue. It puts unknowns into groups
// to be processed by the schedule's unotify method. When it is done processing the pendingUnknowns queue
// it reinitializes the queue.
func (s *Schedule) sendUnknownNotifications() {
slog.Info("Batching and sending unknown notifications")
defer slog.Info("Done sending unknown notifications")
@@ -207,6 +218,8 @@ var unknownMultiGroup = ttemplate.Must(ttemplate.New("unknownMultiGroup").Parse(
</ul>
`))

// notify is a wrapper for the notifications Notify method that sets the EmailSubject and EmailBody for the rendered
// template. It passes properties from the schedule that the Notification's Notify method requires.
func (s *Schedule) notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) {
if len(rt.EmailSubject) == 0 {
rt.EmailSubject = []byte(st.Subject)
@@ -252,6 +265,8 @@ var defaultUnknownTemplate = &conf.Template{
Subject: ttemplate.Must(ttemplate.New("").Parse(`{{.Name}}: {{.Group | len}} unknown alerts`)),
}

// unotify builds an unknown notification for an alertkey or a group of alert keys. It renders the template
// and calls the notification's Notify method to trigger the action.
func (s *Schedule) unotify(name string, group models.AlertKeys, n *conf.Notification) {
subject := new(bytes.Buffer)
body := new(bytes.Buffer)
@@ -275,6 +290,8 @@ func (s *Schedule) unotify(name string, group models.AlertKeys, n *conf.Notifica
n.Notify(subject.String(), body.String(), subject.Bytes(), body.Bytes(), s.SystemConf, name)
}

// QueueNotification stores a notification in the datastore so that it can be sent
// later. This is used for notification chains and for alerts left unevaluated
// because of a dependency. The notification is inserted under the alert key and
// the notification's name, with a time of started plus the notification's Timeout.
// It returns whatever error the datastore insert reports.
func (s *Schedule) QueueNotification(ak models.AlertKey, n *conf.Notification, started time.Time) error {
	store := s.DataAccess.Notifications()
	sendAt := started.Add(n.Timeout)
	return store.InsertNotification(ak, n.Name, sendAt)
}

0 comments on commit 90cc3a9

Please sign in to comment.