-
Notifications
You must be signed in to change notification settings - Fork 26
/
monitor.go
184 lines (152 loc) · 4.87 KB
/
monitor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
package monitor
import (
"context"
"encoding/base64"
"errors"
"fmt"
"sort"
"time"
"github.com/Khan/genqlient/graphql"
"github.com/buildkite/agent-stack-k8s/v2/api"
"github.com/buildkite/agent-stack-k8s/v2/internal/controller/agenttags"
"go.uber.org/zap"
"k8s.io/client-go/kubernetes"
)
// Monitor polls the GraphQL API for scheduled command jobs and hands the ones
// that match its configured agent tags to a JobHandler (see Start).
type Monitor struct {
// gql is the GraphQL client used for every API query.
gql graphql.Client
// logger is the base logger; Start derives an org-scoped logger from it.
logger *zap.Logger
// cfg holds the settings that were supplied to New.
cfg Config
}
// Config carries the settings a Monitor is constructed with.
type Config struct {
// Namespace is not read in this file.
// NOTE(review): presumably the Kubernetes namespace jobs run in — confirm against callers.
Namespace string
// Token is passed to api.NewClient to build the GraphQL client.
Token string
// ClusterUUID selects the clustered API query when non-empty; when empty the
// unclustered query is used.
ClusterUUID string
// MaxInFlight is not read in this file.
// NOTE(review): presumably a cap on concurrently running jobs — confirm against callers.
MaxInFlight int
// Org is the organization identifier sent with every scheduled-jobs query.
Org string
// Tags are the agent tags; they are converted to a map with agenttags.ToMap and
// must yield a "queue" entry or Start reports an error.
Tags []string
}
// JobHandler receives the jobs discovered by a Monitor.
type JobHandler interface {
// Create is called by Monitor.Start for each scheduled command job whose tags
// match the monitor's agent tags. A returned error is logged by the monitor
// and does not stop polling (unless the context has been cancelled).
Create(context.Context, *api.CommandJob) error
}
// New builds a Monitor from the given logger and configuration, creating the
// GraphQL client from cfg.Token.
//
// The k8s argument is accepted but not used by this constructor, and the
// returned error is always nil.
func New(logger *zap.Logger, k8s kubernetes.Interface, cfg Config) (*Monitor, error) {
	m := &Monitor{
		logger: logger,
		cfg:    cfg,
	}
	m.gql = api.NewClient(cfg.Token)
	return m, nil
}
// jobResp is used to identify the response types from methods that call the GraphQL API
// in the cases where a cluster is specified or otherwise.
// The return types are isomorphic, but this has been lost in the generation of the
// API calling methods. As such, the implementations should be syntactically identical, but
// semantically, they operate on different types.
type jobResp interface {
// OrganizationExists reports whether the response carries an organization
// (the implementations check for a non-nil organization ID).
OrganizationExists() bool
// CommandJobs returns the nodes of the response's job edges as command jobs.
CommandJobs() []*api.JobJobTypeCommand
}
// unclusteredJobResp adapts api.GetScheduledJobsResponse (the response of the
// query used when no cluster UUID is configured) to the jobResp interface.
type unclusteredJobResp api.GetScheduledJobsResponse
// OrganizationExists reports whether the response contains an organization,
// which is the case exactly when the organization ID is non-nil.
func (r unclusteredJobResp) OrganizationExists() bool {
	orgID := r.Organization.Id
	return orgID != nil
}
// CommandJobs returns the node of every job edge in the response, in edge
// order, as a command job.
//
// The type assertion is unchecked: a node that is not a *api.JobJobTypeCommand
// causes a panic.
func (r unclusteredJobResp) CommandJobs() []*api.JobJobTypeCommand {
	edges := r.Organization.Jobs.Edges
	jobs := make([]*api.JobJobTypeCommand, len(edges))
	for i := range edges {
		jobs[i] = edges[i].Node.(*api.JobJobTypeCommand)
	}
	return jobs
}
// clusteredJobResp adapts api.GetScheduledJobsClusteredResponse (the response of
// the query used when a cluster UUID is configured) to the jobResp interface.
type clusteredJobResp api.GetScheduledJobsClusteredResponse
// OrganizationExists reports whether the response contains an organization,
// which is the case exactly when the organization ID is non-nil.
func (r clusteredJobResp) OrganizationExists() bool {
	orgID := r.Organization.Id
	return orgID != nil
}
// CommandJobs returns the node of every job edge in the response, in edge
// order, as a command job.
//
// The type assertion is unchecked: a node that is not a *api.JobJobTypeCommand
// causes a panic.
func (r clusteredJobResp) CommandJobs() []*api.JobJobTypeCommand {
	edges := r.Organization.Jobs.Edges
	jobs := make([]*api.JobJobTypeCommand, len(edges))
	for i := range edges {
		jobs[i] = edges[i].Node.(*api.JobJobTypeCommand)
	}
	return jobs
}
// getScheduledCommandJobs fetches the scheduled command jobs for the given
// queue. It calls the unclustered GraphQL API method when no cluster UUID is
// configured and the clustered one otherwise.
//
// On failure it returns a nil jobResp together with the error. The response
// pointer is only dereferenced after the error check, so a nil response from
// the API layer on failure cannot cause a panic, and callers never receive a
// partially populated value alongside an error.
func (m *Monitor) getScheduledCommandJobs(ctx context.Context, queue string) (jobResp, error) {
	// Both API variants take the same single-rule agent query filter.
	agentQueryRules := []string{fmt.Sprintf("queue=%s", queue)}

	if m.cfg.ClusterUUID == "" {
		resp, err := api.GetScheduledJobs(ctx, m.gql, m.cfg.Org, agentQueryRules)
		if err != nil {
			return nil, err
		}
		return unclusteredJobResp(*resp), nil
	}

	resp, err := api.GetScheduledJobsClustered(
		ctx, m.gql, m.cfg.Org, agentQueryRules, encodeClusterGraphQLID(m.cfg.ClusterUUID),
	)
	if err != nil {
		return nil, err
	}
	return clusteredJobResp(*resp), nil
}
// toMapAndLogErrors converts a list of tags into a map using agenttags.ToMap.
// Any errors reported by the conversion are logged as a single warning; the
// map returned by the conversion is passed back to the caller either way.
func toMapAndLogErrors(logger *zap.Logger, tags []string) map[string]string {
	tagMap, parseErrs := agenttags.ToMap(tags)
	if len(parseErrs) > 0 {
		logger.Warn("making a map of agent tags", zap.Errors("err", parseErrs))
	}
	return tagMap
}
// Start begins polling for scheduled command jobs and returns a channel (buffer
// size 1) on which a fatal error may be delivered. The channel is never closed.
//
// If the configured tags do not contain a "queue" tag, an error is placed on
// the channel and no polling goroutine is started. Otherwise a goroutine polls
// once immediately and then once per second until ctx is cancelled:
//
//   - A failed query is logged as a warning and polling continues (unless ctx
//     is already done, in which case the goroutine exits silently).
//   - If the response shows that the organization does not exist, an error is
//     sent on the channel and the goroutine exits.
//   - Jobs are ordered by ScheduledAt, earliest first. Each job whose tags match
//     all of the agent tags is passed to handler.Create; a Create failure is
//     logged as an error and the remaining jobs are still processed.
func (m *Monitor) Start(ctx context.Context, handler JobHandler) <-chan error {
	logger := m.logger.With(zap.String("org", m.cfg.Org))
	errs := make(chan error, 1)

	agentTags := toMapAndLogErrors(logger, m.cfg.Tags)
	queue, hasQueue := agentTags["queue"]
	if !hasQueue {
		errs <- errors.New("missing required tag: queue")
		return errs
	}

	go func() {
		logger.Info("started")

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		// Pre-filled so the first poll happens right away instead of waiting
		// for the first tick.
		first := make(chan struct{}, 1)
		first <- struct{}{}

		for {
			// Block until it is time to poll, or stop on cancellation.
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			case <-first:
			}

			resp, err := m.getScheduledCommandJobs(ctx, queue)
			switch {
			case err != nil && ctx.Err() != nil:
				// Avoid logging if the context is already closed.
				return
			case err != nil:
				logger.Warn("failed to get scheduled command jobs", zap.Error(err))
				continue
			case !resp.OrganizationExists():
				errs <- fmt.Errorf("invalid organization: %q", m.cfg.Org)
				return
			}

			jobs := resp.CommandJobs()

			// TODO: sort by ScheduledAt in the API
			sort.Slice(jobs, func(a, b int) bool {
				return jobs[a].ScheduledAt.Before(jobs[b].ScheduledAt)
			})

			for idx := range jobs {
				job := jobs[idx]
				jobTags := toMapAndLogErrors(logger, job.AgentQueryRules)

				// The api returns jobs that match ANY agent tags (the agent query rules)
				// However, we can only acquire jobs that match ALL agent tags
				if !agenttags.JobTagsMatchAgentTags(jobTags, agentTags) {
					logger.Debug("skipping job because it did not match all tags", zap.Any("job", job))
					continue
				}

				logger.Debug("creating job", zap.String("uuid", job.Uuid))
				createErr := handler.Create(ctx, &job.CommandJob)
				if createErr == nil {
					continue
				}
				// A failure caused by cancellation ends the goroutine quietly.
				if ctx.Err() != nil {
					return
				}
				logger.Error("failed to create job", zap.Error(createErr))
			}
		}
	}()

	return errs
}
// encodeClusterGraphQLID builds the GraphQL ID for a cluster: the cluster UUID
// prefixed with "Cluster---", encoded with standard (padded) base64.
func encodeClusterGraphQLID(clusterUUID string) string {
	const prefix = "Cluster---"
	raw := []byte(prefix + clusterUUID)
	return base64.StdEncoding.EncodeToString(raw)
}