-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
alerts.json
161 lines (161 loc) · 5.51 KB
/
alerts.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
{
"AMS": {
"service": [
{
"name": "ams_metric_monitor_process_percent",
"label": "Percent Metric Monitors Available",
"description": "This alert is triggered if a percentage of Metric Monitor processes are not up and listening on the network for the configured warning and critical thresholds.",
"interval": 1,
"scope": "SERVICE",
"enabled": true,
"source": {
"type": "AGGREGATE",
"alert_name": "ams_metric_monitor_process",
"reporting": {
"ok": {
"text": "affected: [{1}], total: [{0}]"
},
"warning": {
"text": "affected: [{1}], total: [{0}]",
"value": 0.1
},
"critical": {
"text": "affected: [{1}], total: [{0}]",
"value": 0.3
}
}
}
}
],
"METRIC_COLLECTOR": [
{
"name": "ams_metric_collector_process",
"label": "Metric Collector Process",
"description": "This alert is triggered if the Metric Collector cannot be confirmed to be up and listening on the configured port for number of seconds equal to threshold.",
"interval": 1,
"scope": "ANY",
"enabled": true,
"source": {
"type": "PORT",
"uri": "{{ams-site/timeline.metrics.service.webapp.address}}",
"default_port": 6188,
"reporting": {
"ok": {
"text": "TCP OK - {0:.3f}s response on port {1}"
},
"warning": {
"text": "TCP OK - {0:.3f}s response on port {1}",
"value": 1.5
},
"critical": {
"text": "Connection failed: {0} to {1}:{2}",
"value": 5.0
}
}
}
},
{
"name": "ams_metric_collector_hbase_master_process",
"label": "Metric Collector - HBase Master Process",
"description": "This alert is triggered if the Metric Collector's HBase master processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
"interval": 1,
"scope": "ANY",
"source": {
"type": "PORT",
"uri": "{{ams-hbase-site/hbase.master.info.port}}",
"default_port": 61310,
"reporting": {
"ok": {
"text": "TCP OK - {0:.3f}s response on port {1}"
},
"warning": {
"text": "TCP OK - {0:.3f}s response on port {1}",
"value": 1.5
},
"critical": {
"text": "Connection failed: {0} to {1}:{2}",
"value": 5.0
}
}
}
},
{
"name": "ams_metric_collector_hbase_master_cpu",
"label": "Metric Collector HBase Maser CPU Utilization",
"description": "This host-level alert is triggered if CPU utilization of the Metric Collector's HBase Master exceeds certain warning and critical thresholds. It checks the HBase Master JMX Servlet for the SystemCPULoad property. The threshold values are in percent.",
"interval": 5,
"scope": "ANY",
"enabled": true,
"source": {
"type": "METRIC",
"uri": {
"http": "{{ams-hbase-site/hbase.master.info.port}}",
"https": "{{ams-hbase-site/hbase.master.info.port}}",
"https_property": "{{cluster-env/security_enabled}}",
"https_property_value": "true",
"default_port": 61310
},
"reporting": {
"ok": {
"text": "{1} CPU, load {0:.1%}"
},
"warning": {
"text": "{1} CPU, load {0:.1%}",
"value": 200
},
"critical": {
"text": "{1} CPU, load {0:.1%}",
"value": 250
},
"units" : "%"
},
"jmx": {
"property_list": [
"java.lang:type=OperatingSystem/SystemCpuLoad",
"java.lang:type=OperatingSystem/AvailableProcessors"
],
"value": "{0} * 100"
}
}
},
{
"name": "ams_metric_collector_zookeeper_server_process",
"label": "Metric Collector - ZooKeeper Server Process",
"description": "This host-level alert is triggered if the Metric Collector's ZooKeeper server process cannot be determined to be up and listening on the network.",
"interval": 1,
"scope": "ANY",
"source": {
"type": "PORT",
"uri": "{{ams-hbase-site/hbase.zookeeper.property.clientPort}}",
"default_port": 61181,
"reporting": {
"ok": {
"text": "TCP OK - {0:.3f}s response on port {1}"
},
"warning": {
"text": "TCP OK - {0:.3f}s response on port {1}",
"value": 1.5
},
"critical": {
"text": "Connection failed: {0} to {1}:{2}",
"value": 5.0
}
}
}
}
],
"METRIC_MONITOR": [
{
"name": "ams_metric_monitor_process",
"label": "Metric Monitor Status",
"description": "This alert indicates the status of the Metric Monitor process as determined by the monitor status script.",
"interval": 1,
"scope": "ANY",
"source": {
"type": "SCRIPT",
"path": "AMS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py"
}
}
]
}
}