Skip to content

Commit

Permalink
Support dynamically configure alarm settings (#3557)
Browse files Browse the repository at this point in the history
* Support dynamically configure alarm settings

* Update documentation
  • Loading branch information
kezhenxu94 authored and wu-sheng committed Oct 5, 2019
1 parent 81f4c08 commit 2801de7
Show file tree
Hide file tree
Showing 16 changed files with 429 additions and 67 deletions.
1 change: 1 addition & 0 deletions Jenkinsfile
Expand Up @@ -25,6 +25,7 @@ pipeline {
))
timestamps()
skipStagesAfterUnstable()
timeout(time: 5, unit: 'HOURS')
}

environment {
Expand Down
5 changes: 5 additions & 0 deletions Jenkinsfile-E2E
Expand Up @@ -21,6 +21,11 @@ pipeline {
label 'skywalking'
}

options {
timestamps()
timeout(time: 5, unit: 'HOURS')
}

tools {
jdk 'JDK 1.8 (latest)'
}
Expand Down
10 changes: 9 additions & 1 deletion docs/en/setup/backend/backend-alarm.md
@@ -1,5 +1,5 @@
# Alarm
Alarm core is driven a collection of rules, which are defined in `config/alarm-settings.yml`.
Alarm core is driven by a collection of rules, which are defined in `config/alarm-settings.yml`.
There are two parts in alarm rule definition.
1. [Alarm rules](#rules). They define how metrics alarm should be triggered, what conditions should be considered.
1. [Webhooks](#webhook). The list of web service endpoints, which should be called after the alarm is triggered.
Expand Down Expand Up @@ -90,3 +90,11 @@ Example as following
"startTime": 1560524171000
}]
```

## Update the settings dynamically
Since 6.5.0, the alarm settings can be updated dynamically at runtime by [Dynamic Configuration](dynamic-config.md),
which will override the settings in `alarm-settings.yml`.

In order to determine whether an alarm rule is triggered or not, SkyWalking needs to cache the metrics of a time window for
each alarm rule. If any attribute (`metrics-name`, `op`, `threshold`, `period`, `count`, etc.) of a rule is changed,
the sliding window will be destroyed and re-created, causing the alarm evaluation of this specific rule to restart.
3 changes: 2 additions & 1 deletion docs/en/setup/backend/dynamic-config.md
Expand Up @@ -7,7 +7,8 @@ Right now, SkyWalking supports following dynamic configurations.
| Config Key | Value Description | Value Format Example |
|:----:|:----:|:----:|
|receiver-trace.default.slowDBAccessThreshold| Thresholds of slow Database statement, override `receiver-trace/default/slowDBAccessThreshold` of `application.yml`. | default:200,mongodb:50|
|receiver-trace.default.uninstrumentedGateways| The uninstrumented gateways, override `gateways.yml`. | not set |
|receiver-trace.default.uninstrumentedGateways| The uninstrumented gateways, override `gateways.yml`. | same as [`gateways.yml`](uninstrumented-gateways.md#configuration-format) |
|alarm.default.alarm-settings| The alarm settings, will override `alarm-settings.yml`. | same as [`alarm-settings.yml`](backend-alarm.md) |


This feature depends on upstream service, so it is **OFF** as default.
Expand Down
Expand Up @@ -18,11 +18,17 @@

package org.apache.skywalking.oap.server.core.alarm.provider;

import java.util.*;
import java.util.concurrent.*;
import org.apache.skywalking.oap.server.core.alarm.*;
import org.joda.time.*;
import org.slf4j.*;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.joda.time.LocalDateTime;
import org.joda.time.Minutes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Alarm core includes metrics values in certain time windows based on alarm settings. By using its internal timer
Expand All @@ -33,24 +39,15 @@
public class AlarmCore {
private static final Logger logger = LoggerFactory.getLogger(AlarmCore.class);

private Map<String, List<RunningRule>> runningContext;
private LocalDateTime lastExecuteTime;
private AlarmRulesWatcher alarmRulesWatcher;

AlarmCore(Rules rules) {
runningContext = new HashMap<>();
rules.getRules().forEach(rule -> {
RunningRule runningRule = new RunningRule(rule);

String metricsName = rule.getMetricsName();

List<RunningRule> runningRules = runningContext.computeIfAbsent(metricsName, key -> new ArrayList<>());

runningRules.add(runningRule);
});
AlarmCore(AlarmRulesWatcher alarmRulesWatcher) {
this.alarmRulesWatcher = alarmRulesWatcher;
}

public List<RunningRule> findRunningRule(String metricsName) {
return runningContext.get(metricsName);
return alarmRulesWatcher.getRunningContext().get(metricsName);
}

public void start(List<AlarmCallback> allCallbacks) {
Expand All @@ -62,10 +59,10 @@ public void start(List<AlarmCallback> allCallbacks) {
LocalDateTime checkTime = LocalDateTime.now();
int minutes = Minutes.minutesBetween(lastExecuteTime, checkTime).getMinutes();
boolean[] hasExecute = new boolean[] {false};
runningContext.values().forEach(ruleList -> ruleList.forEach(runningRule -> {
alarmRulesWatcher.getRunningContext().values().forEach(ruleList -> ruleList.forEach(runningRule -> {
if (minutes > 0) {
runningRule.moveTo(checkTime);
/**
/*
* Don't run in the first quarter per min, avoid to trigger false alarm.
*/
if (checkTime.getSecondOfMinute() > 15) {
Expand Down
Expand Up @@ -19,6 +19,9 @@
package org.apache.skywalking.oap.server.core.alarm.provider;

import java.io.*;

import org.apache.skywalking.oap.server.configuration.api.ConfigurationModule;
import org.apache.skywalking.oap.server.configuration.api.DynamicConfigurationService;
import org.apache.skywalking.oap.server.core.CoreModule;
import org.apache.skywalking.oap.server.core.alarm.*;
import org.apache.skywalking.oap.server.library.module.*;
Expand All @@ -27,6 +30,7 @@
public class AlarmModuleProvider extends ModuleProvider {

private NotifyHandler notifyHandler;
private AlarmRulesWatcher alarmRulesWatcher;

@Override public String name() {
return "default";
Expand All @@ -49,19 +53,24 @@ public class AlarmModuleProvider extends ModuleProvider {
}
RulesReader reader = new RulesReader(applicationReader);
Rules rules = reader.readRules();
notifyHandler = new NotifyHandler(rules);

alarmRulesWatcher = new AlarmRulesWatcher(rules, this);

notifyHandler = new NotifyHandler(alarmRulesWatcher);
notifyHandler.init(new AlarmStandardPersistence());
this.registerServiceImplementation(MetricsNotify.class, notifyHandler);
}

/**
 * Looks up the {@link DynamicConfigurationService} provided by the configuration module
 * and registers {@code alarmRulesWatcher} with it as a config change watcher.
 */
@Override public void start() throws ServiceNotProvidedException, ModuleStartException {
    getManager().find(ConfigurationModule.NAME)
                .provider()
                .getService(DynamicConfigurationService.class)
                .registerConfigChangeWatcher(alarmRulesWatcher);
}

/**
 * Invokes {@code initCache} on the notify handler, passing this provider's module manager.
 */
@Override public void notifyAfterCompleted() throws ServiceNotProvidedException, ModuleStartException {
notifyHandler.initCache(getManager());
}

@Override public String[] requiredModules() {
return new String[] {CoreModule.NAME};
return new String[] {CoreModule.NAME, ConfigurationModule.NAME};
}
}
Expand Up @@ -19,13 +19,21 @@
package org.apache.skywalking.oap.server.core.alarm.provider;

import java.util.ArrayList;
import java.util.Objects;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

/**
* @author wusheng
*/
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Setter(AccessLevel.PUBLIC)
@Getter(AccessLevel.PUBLIC)
public class AlarmRule {
Expand All @@ -39,4 +47,32 @@ public class AlarmRule {
private int count;
private int silencePeriod;
private String message;

/**
 * Two alarm rules are equal when they are of the same class and every compared
 * attribute (rule name, metrics name, include names, threshold, op, period, count,
 * silence period and message) matches.
 */
@Override
public boolean equals(final Object o) {
    if (o == this) {
        return true;
    }
    if (o == null || o.getClass() != getClass()) {
        return false;
    }

    final AlarmRule that = (AlarmRule) o;

    // Compare the primitive attributes first, then the reference attributes.
    if (period != that.period) {
        return false;
    }
    if (count != that.count) {
        return false;
    }
    if (silencePeriod != that.silencePeriod) {
        return false;
    }

    return Objects.equals(alarmRuleName, that.alarmRuleName)
        && Objects.equals(metricsName, that.metricsName)
        && Objects.equals(includeNames, that.includeNames)
        && Objects.equals(threshold, that.threshold)
        && Objects.equals(op, that.op)
        && Objects.equals(message, that.message);
}

/**
 * Hash code derived from the same attributes that {@link #equals(Object)} compares.
 */
@Override
public int hashCode() {
    return Objects.hash(
        alarmRuleName,
        metricsName,
        includeNames,
        threshold,
        op,
        period,
        count,
        silencePeriod,
        message
    );
}
}
@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package org.apache.skywalking.oap.server.core.alarm.provider;

import lombok.Getter;
import org.apache.skywalking.oap.server.configuration.api.ConfigChangeWatcher;
import org.apache.skywalking.oap.server.core.Const;
import org.apache.skywalking.oap.server.core.alarm.AlarmModule;
import org.apache.skywalking.oap.server.library.module.ModuleProvider;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Alarm rules' settings can be dynamically updated via configuration center(s),
 * this class is responsible for monitoring the configuration and parsing them
 * into {@link Rules} and {@link #runningContext}.
 *
 * <p>Every update builds new maps and then replaces the references held by the
 * volatile fields; this class does not modify a map after it has been assigned.
 *
 * @author kezhenxu94
 * @since 6.5.0
 */
public class AlarmRulesWatcher extends ConfigChangeWatcher {
    /** Running rules grouped by the metrics name of their alarm rule. */
    @Getter
    private volatile Map<String, List<RunningRule>> runningContext;
    /** Running rule of each currently applied alarm rule, used to reuse running rules across updates. */
    private volatile Map<AlarmRule, RunningRule> alarmRuleRunningRuleMap;
    /** The currently applied rules. */
    private volatile Rules rules;
    /** Raw settings text of the last applied dynamic configuration, or empty if none. */
    private volatile String settingsString;

    /**
     * @param defaultRules rules applied immediately on construction
     * @param provider     the module provider passed on to {@link ConfigChangeWatcher}
     */
    public AlarmRulesWatcher(Rules defaultRules, ModuleProvider provider) {
        super(AlarmModule.NAME, provider, "alarm-settings");
        this.runningContext = new HashMap<>();
        this.alarmRuleRunningRuleMap = new HashMap<>();
        this.settingsString = Const.EMPTY_STRING;

        notify(defaultRules);
    }

    /**
     * Applies a configuration change event: a {@code DELETE} event applies an empty
     * {@link Rules}, any other event parses the new value and applies the result.
     *
     * @param value the configuration change event
     */
    @Override
    public void notify(ConfigChangeEvent value) {
        if (value.getEventType() == EventType.DELETE) {
            notify(new Rules());
            settingsString = Const.EMPTY_STRING;
            return;
        }

        final String newSettings = value.getNewValue();
        final Rules parsedRules = new RulesReader(new StringReader(newSettings)).readRules();
        notify(parsedRules);
        // Record the raw text only after it has been parsed and applied, so that value()
        // never reports settings that are not in effect if an exception is thrown above.
        settingsString = newSettings;
    }

    /**
     * Rebuilds the running context from the given rules and publishes it.
     *
     * @param newRules the rules to apply
     */
    void notify(Rules newRules) {
        // Read the volatile reference once; it is replaced at the end of this method.
        final Map<AlarmRule, RunningRule> previousRunningRules = this.alarmRuleRunningRuleMap;
        final Map<AlarmRule, RunningRule> newAlarmRuleRunningRuleMap = new HashMap<>();
        final Map<String, List<RunningRule>> newRunningContext = new HashMap<>();

        for (AlarmRule rule : newRules.getRules()) {
            /*
             * If there is already an alarm rule that is the same as the new one, we'll reuse its
             * corresponding runningRule, to keep its history metrics. A new RunningRule is only
             * created when no equal rule exists.
             */
            RunningRule runningRule = previousRunningRules.get(rule);
            if (runningRule == null) {
                runningRule = new RunningRule(rule);
            }

            newAlarmRuleRunningRuleMap.put(rule, runningRule);

            newRunningContext
                .computeIfAbsent(rule.getMetricsName(), key -> new ArrayList<>())
                .add(runningRule);
        }

        this.rules = newRules;
        this.runningContext = newRunningContext;
        this.alarmRuleRunningRuleMap = newAlarmRuleRunningRuleMap;
    }

    /**
     * @return the raw settings text of the last applied dynamic configuration
     */
    @Override
    public String value() {
        return settingsString;
    }

    /**
     * @return the alarm rules of the currently applied {@link Rules}
     */
    public List<AlarmRule> getRules() {
        return this.rules.getRules();
    }

    /**
     * @return the webhooks of the currently applied {@link Rules}
     */
    public List<String> getWebHooks() {
        return this.rules.getWebhooks();
    }
}
Expand Up @@ -33,11 +33,11 @@ public class NotifyHandler implements MetricsNotify {
private EndpointInventoryCache endpointInventoryCache;

private final AlarmCore core;
private final Rules rules;
private final AlarmRulesWatcher alarmRulesWatcher;

public NotifyHandler(Rules rules) {
this.rules = rules;
core = new AlarmCore(rules);
public NotifyHandler(AlarmRulesWatcher alarmRulesWatcher) {
this.alarmRulesWatcher = alarmRulesWatcher;
core = new AlarmCore(alarmRulesWatcher);
}

@Override public void notify(Metrics metrics) {
Expand Down Expand Up @@ -95,11 +95,8 @@ public NotifyHandler(Rules rules) {
}

public void init(AlarmCallback... callbacks) {
List<AlarmCallback> allCallbacks = new ArrayList<>();
for (AlarmCallback callback : callbacks) {
allCallbacks.add(callback);
}
allCallbacks.add(new WebhookCallback(rules.getWebhooks()));
List<AlarmCallback> allCallbacks = new ArrayList<>(Arrays.asList(callbacks));
allCallbacks.add(new WebhookCallback(alarmRulesWatcher));
core.start(allCallbacks);
}

Expand Down
Expand Up @@ -18,15 +18,11 @@

package org.apache.skywalking.oap.server.core.alarm.provider;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm;
import org.apache.skywalking.oap.server.core.analysis.metrics.*;
import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.IntValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics;
import org.apache.skywalking.oap.server.library.util.CollectionUtils;
import org.joda.time.LocalDateTime;
Expand All @@ -36,6 +32,13 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;

/**
* RunningRule represents each rule in running status. Based on the {@link AlarmRule} definition,
*
Expand Down Expand Up @@ -157,6 +160,8 @@ public List<AlarmMessage> check() {
return alarmMessageList;
}



/**
* A metrics window, based on {@link AlarmRule#period}. This window slides with time, just keeps the recent
* N(period) buckets.
Expand Down

0 comments on commit 2801de7

Please sign in to comment.