/
main.go
345 lines (323 loc) · 10.1 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
package main
import (
"bufio"
"fmt"
"log"
"os"
"os/exec"
"os/signal"
"strconv"
"strings"
"syscall"
"text/template"
"time"
"docker.io/go-docker/api/types"
"docker.io/go-docker/api/types/filters"
"github.com/appcelerator/amp/pkg/docker"
"github.com/spf13/cobra"
"golang.org/x/net/context"
)
// Defaults, file paths, service label names and Prometheus-related literals
// used by the controller.
const (
// default value of the --template flag (configuration template path)
defaultTemplate = "/etc/prometheus/prometheus.tpl"
// default value of the --config flag (generated configuration path)
defaultConfiguration = "/etc/prometheus/prometheus.yml"
// default value of the --period flag, in minutes
defaultPeriod = 1
// default value of the --network flag (network inspected for service discovery)
defaultMonitoringNetwork = "monit"
// default value of the --stack-name flag (prefix trimmed from service names)
defaultStackName = "amp"
// NOTE(review): not referenced in the visible part of this file
dockerForMacIP = "192.168.65.1"
// path of the Prometheus binary started by main
prometheusCmd = "/bin/prometheus"
// service label holding the metrics port; services without it are not discovered
metricsPortLabel = "io.amp.metrics.port"
// service label holding the metrics path (falls back to "/metrics" when absent)
metricsPathLabel = "io.amp.metrics.path"
// service label holding the metrics mode (falls back to metricsModeTasks when absent)
metricsModeLabel = "io.amp.metrics.mode"
// service label holding a regex of metric names to drop
metricsDropLabel = "io.amp.metrics.drop"
// mode "tasks": one static config per task of the service
metricsModeTasks = "tasks"
// mode "exporter": a single static config targeting the service name
metricsModeExporter = "exporter"
// environment variable read by main to set the Prometheus external URL
externalURLEnv = "PROMETHEUS_EXTERNAL_URL"
// Prometheus command line option receiving the external URL
externalURLOption = "--web.external-url"
// source label used by the drop rules built from metricsDropLabel
metaLabelName = "__name__"
// relabel action used by the drop rules
relabelActionDrop = "drop"
)
// Package-level settings bound to command line flags in main.
var (
// name of the network looked up and inspected by update (--network flag)
monitoringNetwork string
// stack name; "<stackName>_" is trimmed from service names to build job names (--stack-name flag)
stackName string
)
// prometheusArgs is the base argument list passed to the Prometheus binary.
// main appends the external URL option to it when the PROMETHEUS_EXTERNAL_URL
// environment variable is set.
var prometheusArgs = []string{
"--config.file=/etc/prometheus/prometheus.yml",
"--storage.tsdb.path=/prometheus",
"--storage.tsdb.retention=15d",
"--web.console.libraries=/usr/share/prometheus/console_libraries",
"--web.console.templates=/usr/share/prometheus/consoles",
"--storage.tsdb.no-lockfile",
}
// Inventory is the data handed to the configuration template by update.
type Inventory struct {
// jobs built by prepareJobs
Jobs []Job
}
// Job describes one job built by prepareJobs for a discovered service.
type Job struct {
// service name with the "<stackName>_" prefix trimmed (and the "_exporter"
// suffix trimmed in exporter mode)
Name string
// metricsModeTasks or metricsModeExporter
Mode string
// targets of the job: one per task in tasks mode, a single one in exporter mode
StaticConfigs []StaticConfig
// relabel rules setting the "instance" target label
RelabelConfigs []RelabelConfig
// drop rules built from the metricsDropLabel service label, if any
MetricsRelabelConfigs []RelabelConfig
// value of the metricsPathLabel service label, "/metrics" when absent
MetricsPath string
}
// StaticConfig is a static config entry (one target) of a Prometheus job.
type StaticConfig struct {
// task endpoint IP in tasks mode, service name in exporter mode
Target string
// metrics port read from the metricsPortLabel service label
Port int
// labels attached to the target ("hostip" and "taskname" in tasks mode,
// left unset in exporter mode)
Labels map[string]string
}
// RelabelConfig holds the fields of one relabel rule attached to a Job.
// prepareJobs only fills the fields relevant to each rule; the others keep
// their zero value.
// NOTE(review): how each field is rendered depends on the configuration
// template, which is not part of this file.
type RelabelConfig struct {
SourceLabels []string
Separator string
TargetLabel string
Replacement string
Regex string
Action string
}
// Target groups a name, a port and a metrics path.
// NOTE(review): this type is not referenced in the visible part of this file.
type Target struct {
Name string
Port int
MetricsPath string
}
// prepareJobs lists the swarm services carrying the io.amp.metrics.port label
// and turns each of them into a Job.
//
// A service is skipped (with a warning) when its port label is missing or not
// numerical, or when the service is not present in networkResource. In "tasks"
// mode a job gets one static config per task of the service (services without
// tasks are skipped silently); in "exporter" mode it gets a single static
// config targeting the service name. Any other mode is reported and handled as
// "tasks".
//
// The only error returned is the one reported by the service listing call.
func prepareJobs(client *docker.Docker, networkResource types.NetworkResource) ([]Job, error) {
	// only available on manager nodes
	labelFilter := filters.NewArgs()
	labelFilter.Add("label", metricsPortLabel)
	services, err := client.GetClient().ServiceList(context.Background(), types.ServiceListOptions{Filters: labelFilter})
	if err != nil {
		return nil, err
	}
	if len(services) == 0 {
		fmt.Println("Warning: no service discovered for monitoring")
	}

	stackPrefix := stackName + "_"
	var jobs []Job
	for _, svc := range services {
		name := svc.Spec.Annotations.Name
		labels := svc.Spec.Annotations.Labels
		// labelOr returns the label value when the label exists (even if it is
		// empty), fallback otherwise.
		labelOr := func(key, fallback string) string {
			if value, present := labels[key]; present {
				return value
			}
			return fallback
		}

		rawPort, found := labels[metricsPortLabel]
		if !found {
			fmt.Printf("Warning: unable to get metrics port label for service %s, ignoring it\n", name)
			continue
		}
		metricsPort, convErr := strconv.Atoi(rawPort)
		if convErr != nil {
			log.Printf("Warning: non numerical port for service %s: %s\n", name, rawPort)
			continue
		}

		metricsPath := labelOr(metricsPathLabel, "/metrics")
		// mode can be tasks or exporter
		metricsMode := labelOr(metricsModeLabel, metricsModeTasks)
		// metrics to ignore
		metricsToDrop := labelOr(metricsDropLabel, "")
		fmt.Printf("discovered service %s on port %d and path %s, mode %s\n", name, metricsPort, metricsPath, metricsMode)

		svcInfo, attached := networkResource.Services[name]
		if !attached {
			fmt.Printf("Warning: service %s not found in network %s, ignoring it\n", name, monitoringNetwork)
			continue
		}

		// the drop rule is the same in both modes
		var dropRules []RelabelConfig
		if metricsToDrop != "" {
			dropRules = []RelabelConfig{
				{SourceLabels: []string{metaLabelName}, Regex: metricsToDrop, Action: relabelActionDrop},
			}
		}

		if metricsMode != metricsModeTasks && metricsMode != metricsModeExporter {
			fmt.Printf("Warning: wrong metrics mode (%s) for service %s, force it to %s\n", metricsMode, name, metricsModeTasks)
			metricsMode = metricsModeTasks
		}

		if metricsMode == metricsModeExporter {
			shortName := strings.TrimSuffix(strings.TrimPrefix(name, stackPrefix), "_exporter")
			jobs = append(jobs, Job{
				Name:        shortName,
				Mode:        metricsModeExporter,
				MetricsPath: metricsPath,
				StaticConfigs: []StaticConfig{
					{Target: name, Port: metricsPort},
				},
				RelabelConfigs: []RelabelConfig{
					{Replacement: shortName, TargetLabel: "instance"},
				},
				MetricsRelabelConfigs: dropRules,
			})
			continue
		}

		// tasks mode
		if len(svcInfo.Tasks) == 0 {
			continue
		}
		var targets []StaticConfig
		for _, task := range svcInfo.Tasks {
			targets = append(targets, StaticConfig{
				Target: task.EndpointIP,
				Port:   metricsPort,
				Labels: map[string]string{
					"hostip":   task.Info["Host IP"],
					"taskname": task.Name,
				},
			})
		}
		jobs = append(jobs, Job{
			Name:          strings.TrimPrefix(name, stackPrefix),
			Mode:          metricsModeTasks,
			MetricsPath:   metricsPath,
			StaticConfigs: targets,
			// all "tasks" jobs have the same relabel config
			RelabelConfigs: []RelabelConfig{
				{SourceLabels: []string{"hostip"}, Separator: "@", TargetLabel: "instance"},
			},
			MetricsRelabelConfigs: dropRules,
		})
	}
	return jobs, nil
}
// update regenerates the Prometheus configuration and asks Prometheus to
// reload it.
//
// Steps: connect to the Docker engine, look up the monitoring network by name
// (exactly one match is required), inspect it in verbose mode, build the jobs
// with prepareJobs, render the template file configurationTemplate into the
// file configuration, then send SIGHUP to prometheus through killall.
//
// The first failing step aborts the update and its error is returned.
func update(client *docker.Docker, configurationTemplate string, configuration string) error {
	// connect to the engine API
	if err := client.Connect(); err != nil {
		return err
	}

	filter := filters.NewArgs()
	filter.Add("name", monitoringNetwork)
	networkResources, err := client.GetClient().NetworkList(context.Background(), types.NetworkListOptions{Filters: filter})
	if err != nil {
		return err
	}
	if len(networkResources) != 1 {
		return fmt.Errorf("network lookup failed (%s)", monitoringNetwork)
	}
	networkID := networkResources[0].ID
	networkResource, err := client.GetClient().NetworkInspect(context.Background(), networkID, types.NetworkInspectOptions{Verbose: true})
	if err != nil {
		// without this check a failed inspect would silently produce an
		// empty inventory and overwrite a valid configuration
		return err
	}

	jobs, err := prepareJobs(client, networkResource)
	if err != nil {
		return err
	}
	inventory := &Inventory{Jobs: jobs}

	// prepare the configuration; a broken template is reported as an error
	// instead of panicking so the periodic loop in main can keep running
	t, err := template.New("prometheus.tpl").Funcs(template.FuncMap{"StringsJoin": strings.Join}).ParseFiles(configurationTemplate)
	if err != nil {
		return err
	}
	configurationFile, err := os.Create(configuration)
	if err != nil {
		return err
	}
	if err := t.Execute(configurationFile, inventory); err != nil {
		// best effort: the rendering error is the one worth reporting
		configurationFile.Close()
		return err
	}
	// a failed close can mean the configuration was not fully written
	if err := configurationFile.Close(); err != nil {
		return err
	}

	// reload prometheus
	cmd := exec.Command("/usr/bin/killall", "-HUP", "prometheus")
	if err := cmd.Run(); err != nil {
		log.Println("Prometheus reload failed, error message follows")
		return err
	}
	return nil
}
// isAManager reports whether the engine reached through client is a swarm
// manager, i.e. whether its own node ID appears among the remote managers
// listed in the engine info. An error is returned when the connection or the
// info request fails.
func isAManager(client *docker.Docker) (bool, error) {
	err := client.Connect()
	if err != nil {
		return false, err
	}
	engineInfo, err := client.GetClient().Info(context.Background())
	if err != nil {
		return false, err
	}
	swarm := engineInfo.Swarm
	for i := range swarm.RemoteManagers {
		if swarm.RemoteManagers[i].NodeID == swarm.NodeID {
			return true, nil
		}
	}
	return false, nil
}
// main starts Prometheus as a child process, forwards its output to stdout,
// and runs the promctl root command, which regenerates the Prometheus
// configuration from swarm service discovery: once at startup, then every
// "period" minutes, until SIGINT or SIGTERM is received. On shutdown it asks
// killall to terminate prometheus.
func main() {
	var configuration string
	var configurationTemplate string
	var period int32

	var RootCmd = &cobra.Command{
		Use:   "promctl",
		Short: "Prometheus controller",
		Long:  `Keep the Prometheus configuration up to date with swarm discovery`,
		RunE: func(cmd *cobra.Command, args []string) error {
			client := docker.NewEnvClient()
			manager, err := isAManager(client)
			// err and manager are checked separately: on a reachable worker
			// node err is nil, and calling err.Error() would panic
			if err != nil {
				fmt.Printf("Docker Swarm manager connection failed: %s\n", err.Error())
				return fmt.Errorf("service discovery requires a connection to a manager engine socket")
			}
			if !manager {
				fmt.Println("Docker Swarm manager connection failed: this node is not a swarm manager")
				return fmt.Errorf("service discovery requires a connection to a manager engine socket")
			}

			stop := make(chan os.Signal, 1)
			signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
			tick := time.Tick(time.Duration(period) * time.Minute)

			// NOTE(review): presumably leaves Prometheus time to start before
			// the first reload signal is sent; confirm
			time.Sleep(5 * time.Second)
			// a failure of the initial update is fatal
			if err := update(client, configurationTemplate, configuration); err != nil {
				return err
			}

		loop:
			for {
				select {
				case <-tick:
					// periodic failures are only reported, the next tick retries
					if err := update(client, configurationTemplate, configuration); err != nil {
						fmt.Println(err.Error())
					}
				case sig := <-stop:
					log.Printf("%v signal trapped\n", sig)
					break loop
				}
			}

			log.Println("Stopping Prometheus")
			stopCmd := exec.Command("/usr/bin/killall", "prometheus")
			if err := stopCmd.Run(); err != nil {
				log.Println("unable to stop Prometheus, error message follows")
				return err
			}
			return nil
		},
	}
	RootCmd.PersistentFlags().StringVarP(&configuration, "config", "c", defaultConfiguration, "config file")
	RootCmd.PersistentFlags().StringVarP(&configurationTemplate, "template", "t", defaultTemplate, "template file")
	RootCmd.PersistentFlags().Int32VarP(&period, "period", "p", defaultPeriod, "reload period in minute")
	RootCmd.PersistentFlags().StringVarP(&monitoringNetwork, "network", "n", defaultMonitoringNetwork, "Overlay network for monitoring")
	RootCmd.PersistentFlags().StringVarP(&stackName, "stack-name", "s", defaultStackName, "Stack name (for prefix trimming)")

	// Set Prometheus external URL if provided
	if url := os.Getenv(externalURLEnv); url != "" {
		prometheusArgs = append(prometheusArgs, fmt.Sprintf("%s=%s", externalURLOption, url))
	}

	// start Prometheus
	proc := exec.Command(prometheusCmd, prometheusArgs...)
	stdout, err := proc.StdoutPipe()
	if err != nil {
		log.Fatalln(err)
	}
	stderr, err := proc.StderrPipe()
	if err != nil {
		log.Fatalln(err)
	}

	// both output streams of Prometheus are copied line by line to stdout
	outscanner := bufio.NewScanner(stdout)
	errscanner := bufio.NewScanner(stderr)
	go func() {
		for outscanner.Scan() {
			fmt.Println(outscanner.Text())
		}
	}()
	go func() {
		for errscanner.Scan() {
			fmt.Println(errscanner.Text())
		}
	}()

	// Prometheus runs in the background; the controller exits as soon as
	// Prometheus fails to start or exits with an error
	go func() {
		err := proc.Start()
		if err != nil {
			log.Fatalln(err)
		}
		err = proc.Wait()
		log.Println("Prometheus has exited")
		if err != nil {
			log.Fatalln(err)
		}
		// TODO: process is terminated without error, we should probably also stop the root cmd
	}()

	if err := RootCmd.Execute(); err != nil {
		log.Fatalln(err)
	}
}