-
Notifications
You must be signed in to change notification settings - Fork 8
/
rds.go
258 lines (219 loc) · 9.72 KB
/
rds.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
// Copyright 2022 Block, Inc.
package awsrds
import (
"context"
"fmt"
"strings"
"time"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/cloudwatch"
"github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
"github.com/cashapp/blip"
)
// Identifiers used by this collector when reading a plan.
const (
// DOMAIN is the metric domain name: the key looked up in each level's
// Collect map and the value returned by RDS.Domain.
DOMAIN = "aws.rds"
// OPT_DB_ID is the domain option key holding the database instance
// identifier to query.
OPT_DB_ID = "db-id"
)
// CloudWatchClient is the only part of the CloudWatch API that RDS needs:
// the GetMetricData call made in RDS.Collect. Accepting this interface in
// NewRDS lets callers supply any implementation.
// NOTE(review): presumably satisfied by *cloudwatch.Client as returned by
// NewCloudWatchClient; confirm against the SDK.
type CloudWatchClient interface {
GetMetricData(ctx context.Context, params *cloudwatch.GetMetricDataInput, optFns ...func(*cloudwatch.Options)) (*cloudwatch.GetMetricDataOutput, error)
}
// NewCloudWatchClient builds a CloudWatch client from the given AWS config
// by delegating to cloudwatch.NewFromConfig.
func NewCloudWatchClient(awsConfig aws.Config) *cloudwatch.Client {
	client := cloudwatch.NewFromConfig(awsConfig)
	return client
}
// Request parameters shared by every metric query built in RDS.Prepare.
var (
rdsNamespace = aws.String("AWS/RDS") // namespace of the queried metrics
rdsAverage = aws.String("Average") // statistic requested for each metric
rdsDbId = aws.String("DBInstanceIdentifier") // dimension name that selects the DB instance
rds60s = aws.Int32(60) // query period; used as the 1-minute max resolution
)
// https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/MonitoringOverview.html#rds-metrics
// https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/monitoring-cloudwatch.html#rds-metrics
// We only collect the average because there should only be 1 sample because
// we collect at max resolution for CloudWatch Metrics: 1 minute. We check the
// sample size and warn if there's >1. So whereas the API call is supposed to
// return statistical values, we're actually getting the raw per-minute data points,
// and we let SignalFx do aggregation/stats.
// RDS collects basic RDS metrics: https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/MonitoringOverview.html#rds-metrics
// Enhanced metrics are not collected yet because they're actually logged to
// CloudWatch Logs, not the CloudWatch Metrics API. So handling that is nontrivial.
//
// Prepare builds one GetMetricData request per plan level; Collect sends the
// request for a level and converts the returned data points to Blip metrics.
type RDS struct {
client CloudWatchClient // CloudWatch API called by Collect
// --
dbId string // DB instance identifier: the db-id option, else the plan's monitor ID (set in Prepare)
monitorId string // plan.MonitorId saved by Prepare; used in Collect debug output
atLevel map[string]*cloudwatch.GetMetricDataInput // prepared request, keyed on level
latestTs map[string]map[string]time.Time // newest timestamp already reported, keyed on level => metric
}
// NewRDS returns an RDS collector that queries CloudWatch through client.
// The per-level request and timestamp maps start out empty and are filled
// in by Prepare.
func NewRDS(client CloudWatchClient) *RDS {
	rds := &RDS{client: client}
	rds.atLevel = make(map[string]*cloudwatch.GetMetricDataInput)
	rds.latestTs = make(map[string]map[string]time.Time)
	return rds
}
// Domain returns the metric domain name of this collector: "aws.rds".
func (m *RDS) Domain() string {
	return DOMAIN
}
// Help returns the description of the aws.rds domain and of its single
// option, db-id (the database instance identifier).
//
// NOTE(review): the db-id default is a plain string literal, not a format
// string, so "%%" is two literal percent signs. Confirm whether the consumer
// of CollectorHelp formats this value or whether "%{monitor.id}" was intended.
func (m *RDS) Help() blip.CollectorHelp {
	return blip.CollectorHelp{
		Domain:      DOMAIN,
		Description: "Amazon RDS metrics like 'CPUUtilization' and 'FreeableMemory'",
		Options: map[string]blip.CollectorHelpOption{
			OPT_DB_ID: {
				Name:    OPT_DB_ID,
				Desc:    "Database instance identifier",
				Default: "%%{monitor.id}",
			},
		},
	}
}
// Prepare builds and caches one CloudWatch GetMetricData request for every
// level of the plan that collects the aws.rds domain. Levels that do not
// collect the domain are skipped.
//
// The DB instance identifier is the level's "db-id" option, or plan.MonitorId
// when that option is empty. Each metric listed for the domain becomes one
// query for the "Average" statistic over a 60-second period, and its
// last-reported timestamp is reset to the zero time.
//
// It returns an error if a level collects the domain but lists no metrics.
// The returned func is always nil.
func (m *RDS) Prepare(ctx context.Context, plan blip.Plan) (func(), error) {
	m.monitorId = plan.MonitorId

	for _, level := range plan.Levels {
		dom, ok := level.Collect[DOMAIN]
		if !ok {
			continue // not collected in this level
		}

		dbId := dom.Options[OPT_DB_ID]
		if dbId == "" {
			dbId = plan.MonitorId
		}
		m.dbId = dbId

		if len(dom.Metrics) == 0 {
			return nil, fmt.Errorf("at %s/%s/aws.rds: no metrics specified; expected at least 1 metric (BinLogDiskUsage, CPUUtilization, etc.)",
				plan.Name, level.Name)
		}

		queries := make([]types.MetricDataQuery, len(dom.Metrics))
		latest := make(map[string]time.Time, len(dom.Metrics))
		for i, metric := range dom.Metrics {
			latest[metric] = time.Time{}
			queries[i] = types.MetricDataQuery{
				Id: aws.String(strings.ToLower(metric)), // must match /^[a-z][a-zA-Z0-9_]*$/
				MetricStat: &types.MetricStat{
					Stat:   rdsAverage,
					Period: rds60s, // max resolution for CloudWatch Metrics
					Metric: &types.Metric{
						MetricName: aws.String(metric),
						Namespace:  rdsNamespace,
						Dimensions: []types.Dimension{
							{
								Name: rdsDbId,
								// Give each level its own copy of the identifier.
								// Pointing at m.dbId would make every level's
								// request follow the value assigned last.
								Value: aws.String(dbId),
							},
						},
					},
				},
			}
		}
		m.latestTs[level.Name] = latest

		m.atLevel[level.Name] = &cloudwatch.GetMetricDataInput{
			//StartTime: &begin, // set in Collect
			//EndTime:   &now,   // set in Collect
			ScanBy:            types.ScanByTimestampAscending,
			MetricDataQueries: queries,
		}
	}

	return nil, nil
}
// Collect fetches the metrics prepared for levelName from CloudWatch and
// returns them as Blip metric values. It returns nil, nil if the level was
// not prepared, and the client's error if the GetMetricData call fails.
//
// Each returned value carries its CloudWatch timestamp, in milliseconds, in
// Meta["ts"]. Values are gauges unless the metric is listed in isCounter.
func (m *RDS) Collect(ctx context.Context, levelName string) ([]blip.MetricValue, error) {
	// Every minute, request the last _2 minutes_ for each metric.
	// This is not a typo, it's because RDS is slow and weird wrt metrics.
	// Most metrics trail by about ~1 min, so we have to trail too, but
	// at least one metric (CPUUtilization) seems always to trail by 2 mins.
	// Since we fetch the last 2 minutes, each metric can return 0, 1, or 2
	// data points. Data points that are not newer than the last one reported
	// for the metric are dropped, so nothing is reported twice.
	// And to make things extra fun, CPUUtilization usually returns an older
	// timestamp, e.g. other metrics report at 00:01 but CPUUtilization reports
	// at 00:00. That's why the latest timestamp is tracked per metric and
	// every value carries its own timestamp.
	input, ok := m.atLevel[levelName]
	if !ok {
		return nil, nil
	}

	now := time.Now()
	begin := now.Add(-2 * time.Minute).Round(time.Minute) // see code comment above
	input.StartTime = &begin
	input.EndTime = &now

	output, err := m.client.GetMetricData(ctx, input)
	if err != nil {
		return nil, err
	}

	latest := m.latestTs[levelName] // newest timestamp reported so far, per metric

	metrics := []blip.MetricValue{} // CloudWatch data points converted to Blip metrics
	for i := range output.MetricDataResults {
		r := &output.MetricDataResults[i]

		// The label is used as the metric name; without one the result
		// can't be matched to a metric, so skip it instead of panicking.
		if r.Label == nil {
			blip.Debug("%s: drop: result %d has no label\n", m.monitorId, i)
			continue
		}
		name := *r.Label

		// Timestamps and Values are parallel slices; only index the part
		// that both of them have.
		n := len(r.Timestamps)
		if len(r.Values) < n {
			n = len(r.Values)
		}

		for j := 0; j < n; j++ {
			ts, val := r.Timestamps[j], r.Values[j]

			// If AWS ts is not after latest ts, then it's an old or duplicate value
			// that we've already reported; skip it
			if !ts.After(latest[name]) {
				blip.Debug("%s: drop: %s %s = %f\n", m.monitorId, ts, name, val)
				continue
			}
			blip.Debug("%s: keep: %s %s = %f\n", m.monitorId, ts, name, val)
			latest[name] = ts

			mv := blip.MetricValue{
				Name:  name,
				Type:  blip.GAUGE, // almost all RDS metrics are gauges
				Value: val,
				Meta: map[string]string{
					"ts": fmt.Sprintf("%d", ts.UnixMilli()), // must be milliseconds
				},
			}
			if isCounter[name] {
				mv.Type = blip.COUNTER
			}
			metrics = append(metrics, mv)
		}
	}

	return metrics, nil
}
// isCounter is the set of RDS metric names that Collect reports with type
// COUNTER instead of the default GAUGE. Names not listed here map to false.
var isCounter = func() map[string]bool {
	names := []string{
		"AbortedClients",
		"BacktrackWindowAlert",
		"CPUCreditBalance",
		"CPUCreditUsage",
		"EngineUptime",
		"NumBinaryLogFiles",
		"RowLockTime",
	}
	set := make(map[string]bool, len(names))
	for _, name := range names {
		set[name] = true
	}
	return set
}()
/*
Example of CPUUtilization trailing by 2 minutes:
15:25:33.242882 metrics.go:78: [2020-09-30 15:23:33 +0000 UTC] to [2020-09-30 15:25:33 +0000 UTC]
15:25:33.540432 metrics.go:120: FreeStorageSpace at 2020-09-30 15:24:00 +0000 UTC
15:25:33.540470 metrics.go:120: FreeStorageSpace at 2020-09-30 15:23:00 +0000 UTC
15:25:33.613576 metrics.go:120: FreeableMemory at 2020-09-30 15:24:00 +0000 UTC
15:25:33.613614 metrics.go:120: FreeableMemory at 2020-09-30 15:23:00 +0000 UTC
15:25:33.677005 metrics.go:120: BinLogDiskUsage at 2020-09-30 15:24:00 +0000 UTC
15:25:33.677040 metrics.go:120: BinLogDiskUsage at 2020-09-30 15:23:00 +0000 UTC
15:25:33.738875 metrics.go:120: ReadIOPS at 2020-09-30 15:24:00 +0000 UTC
15:25:33.738914 metrics.go:120: ReadIOPS at 2020-09-30 15:23:00 +0000 UTC
15:25:33.787929 metrics.go:120: WriteIOPS at 2020-09-30 15:24:00 +0000 UTC
15:25:33.787965 metrics.go:120: WriteIOPS at 2020-09-30 15:23:00 +0000 UTC
15:25:34.080282 metrics.go:120: CPUUtilization at 2020-09-30 15:23:00 +0000 UTC
15:25:34.131457 metrics.go:115: zero data points for BurstBalance
15:26:33.242886 metrics.go:78: [2020-09-30 15:24:33 +0000 UTC] to [2020-09-30 15:26:33 +0000 UTC]
15:26:33.521758 metrics.go:120: FreeStorageSpace at 2020-09-30 15:25:00 +0000 UTC
15:26:33.521793 metrics.go:120: FreeStorageSpace at 2020-09-30 15:24:00 +0000 UTC
15:26:33.566831 metrics.go:120: FreeableMemory at 2020-09-30 15:25:00 +0000 UTC
15:26:33.566871 metrics.go:120: FreeableMemory at 2020-09-30 15:24:00 +0000 UTC
15:26:33.618570 metrics.go:120: BinLogDiskUsage at 2020-09-30 15:25:00 +0000 UTC
15:26:33.618606 metrics.go:120: BinLogDiskUsage at 2020-09-30 15:24:00 +0000 UTC
15:26:33.663773 metrics.go:120: ReadIOPS at 2020-09-30 15:25:00 +0000 UTC
15:26:33.663823 metrics.go:120: ReadIOPS at 2020-09-30 15:24:00 +0000 UTC
15:26:33.754530 metrics.go:120: WriteIOPS at 2020-09-30 15:25:00 +0000 UTC
15:26:33.754565 metrics.go:120: WriteIOPS at 2020-09-30 15:24:00 +0000 UTC
15:26:34.022653 metrics.go:120: CPUUtilization at 2020-09-30 15:24:00 +0000 UTC
15:26:34.085970 metrics.go:115: zero data points for BurstBalance
15:27:33.242881 metrics.go:78: [2020-09-30 15:25:33 +0000 UTC] to [2020-09-30 15:27:33 +0000 UTC]
15:27:33.494566 metrics.go:120: FreeStorageSpace at 2020-09-30 15:25:00 +0000 UTC
15:27:33.494605 metrics.go:120: FreeStorageSpace at 2020-09-30 15:26:00 +0000 UTC
15:27:33.555669 metrics.go:120: FreeableMemory at 2020-09-30 15:25:00 +0000 UTC
15:27:33.555717 metrics.go:120: FreeableMemory at 2020-09-30 15:26:00 +0000 UTC
15:27:33.608138 metrics.go:120: BinLogDiskUsage at 2020-09-30 15:25:00 +0000 UTC
15:27:33.608188 metrics.go:120: BinLogDiskUsage at 2020-09-30 15:26:00 +0000 UTC
15:27:33.657514 metrics.go:120: ReadIOPS at 2020-09-30 15:26:00 +0000 UTC
15:27:33.657587 metrics.go:120: ReadIOPS at 2020-09-30 15:25:00 +0000 UTC
15:27:33.706865 metrics.go:120: WriteIOPS at 2020-09-30 15:26:00 +0000 UTC
15:27:33.706901 metrics.go:120: WriteIOPS at 2020-09-30 15:25:00 +0000 UTC
15:27:33.934436 metrics.go:120: CPUUtilization at 2020-09-30 15:25:00 +0000 UTC
15:27:33.982811 metrics.go:115: zero data points for BurstBalance
*/