-
Notifications
You must be signed in to change notification settings - Fork 130
/
zombie_instances.go
106 lines (90 loc) · 3.09 KB
/
zombie_instances.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package metrics
import (
"fmt"
"strings"
"time"
"github.com/bazelbuild/continuous-integration/metrics/clients"
"github.com/bazelbuild/continuous-integration/metrics/data"
)
// ciWorkerNamePrefix is the instance-name prefix used by getInstances to pick
// CI worker VMs out of all Compute Engine instances.
const ciWorkerNamePrefix = "bk-"
// ZombieInstances is a metric that reports CI worker VMs for which no
// Buildkite agent with a matching hostname is registered.
type ZombieInstances struct {
// computeClient lists the Compute Engine instances of cloudProjects.
computeClient *clients.ComputeEngineClient
// cloudProjects are the projects passed to computeClient.GetAllInstances.
cloudProjects []string
// bkClient lists the Buildkite agents of each organization in bkOrgs.
bkClient clients.BuildkiteClient
// bkOrgs are the Buildkite organizations whose agents are queried.
bkOrgs []string
// gracePeriod is the minimum time since creation before an instance
// without an agent is reported; younger instances are skipped.
gracePeriod time.Duration
// columns are returned by Columns and supply the column names of the
// data set built by Collect.
columns []Column
}
// Name returns the identifier of this metric, "zombie_instances".
func (zi *ZombieInstances) Name() string {
	const metricName = "zombie_instances"
	return metricName
}
// Columns returns the column definitions this metric was created with.
func (zi *ZombieInstances) Columns() []Column {
	cols := zi.columns
	return cols
}
// Type reports that this metric is a TimeBasedMetric.
func (zi *ZombieInstances) Type() MetricType {
	return TimeBasedMetric
}
// RelevantDelta returns 600, i.e. ten minutes expressed in seconds.
// NOTE(review): how callers interpret this window is not visible in this
// file — confirm against the metric scheduler/publisher.
func (zi *ZombieInstances) RelevantDelta() int {
	const (
		minutes          = 10
		secondsPerMinute = 60
	)
	return minutes * secondsPerMinute
}
// Collect builds a data set of "zombie" CI instances: Compute Engine
// instances carrying the CI worker name prefix for which no Buildkite agent
// is registered under a hostname equal to the instance name.
//
// An instance is skipped when its status is "STOPPING" or when it was created
// less than gracePeriod ago. Every remaining instance yields one row holding
// its project, zone, name, status, seconds since creation and the collection
// timestamp.
//
// An error is returned if the Buildkite agents or the instances cannot be
// fetched, or if a row cannot be added to the data set.
func (zi *ZombieInstances) Collect() (data.DataSet, error) {
	agentHostNameIndex, err := zi.getAgentHostNameIndex()
	if err != nil {
		return nil, fmt.Errorf("Failed to fetch Buildkite agents: %v", err)
	}

	instances, err := zi.getInstances()
	if err != nil {
		return nil, fmt.Errorf("Failed to fetch GCE instances: %v", err)
	}

	// Read the clock once so that every row of this collection carries the
	// same timestamp and seconds_online is measured against that same instant.
	now := time.Now()

	result := data.CreateDataSet(GetColumnNames(zi.columns))
	for _, instance := range instances {
		if _, ok := agentHostNameIndex[instance.Name]; ok {
			// Agent is up and running
			continue
		}

		if instance.Status == "STOPPING" {
			// VM is already on its way out
			continue
		}

		onlineTime := now.Sub(instance.CreationTime)
		if onlineTime < zi.gracePeriod {
			// VM was started only very recently
			continue
		}

		err = result.AddRow(instance.Project, instance.Zone, instance.Name, instance.Status, onlineTime.Seconds(), now)
		if err != nil {
			return nil, err
		}
	}
	return result, nil
}
// getInstances fetches all instances of the configured cloud projects and
// returns only those whose name starts with ciWorkerNamePrefix. The returned
// slice is empty (not nil) when nothing matches; any error from the compute
// client is passed through unchanged.
func (zi *ZombieInstances) getInstances() ([]*clients.ComputeInstance, error) {
	all, err := zi.computeClient.GetAllInstances(zi.cloudProjects)
	if err != nil {
		return nil, err
	}

	workers := []*clients.ComputeInstance{}
	for _, candidate := range all {
		if !strings.HasPrefix(candidate.Name, ciWorkerNamePrefix) {
			// Not a CI worker
			continue
		}
		workers = append(workers, candidate)
	}
	return workers, nil
}
// getAgentHostNameIndex returns the set of hostnames of all Buildkite agents
// in the configured organizations. Every known hostname maps to true, so both
// `_, ok := index[name]` and `index[name]` report membership correctly.
//
// Agents without a hostname are ignored. The first error returned by the
// Buildkite client aborts the lookup and is passed through unchanged.
func (zi *ZombieInstances) getAgentHostNameIndex() (map[string]bool, error) {
	hostNameIndex := make(map[string]bool)
	for _, org := range zi.bkOrgs {
		agents, err := zi.bkClient.GetAgents(org)
		if err != nil {
			return nil, err
		}

		for _, agent := range agents {
			// Hostname is a pointer; skip agents that did not report one
			// instead of panicking on a nil dereference.
			if agent.Hostname == nil {
				continue
			}
			hostNameIndex[*agent.Hostname] = true
		}
	}
	return hostNameIndex, nil
}
// CreateZombieInstances returns a ZombieInstances metric that uses
// computeClient to list the instances of cloudProjects and bkClient to list
// the agents of bkOrgs. Instances created less than gracePeriod ago are not
// reported by the metric.
//
// The metric's columns correspond to the following table definition (the
// columns flagged true below are presumably the primary-key columns — they
// match the PRIMARY KEY clause):
//
//	CREATE TABLE zombie_instances (cloud_project VARCHAR(255), zone VARCHAR(255), instance VARCHAR(255), status VARCHAR(255), seconds_online FLOAT, timestamp DATETIME, PRIMARY KEY(cloud_project, zone, instance));
func CreateZombieInstances(computeClient *clients.ComputeEngineClient, cloudProjects []string, bkClient clients.BuildkiteClient, bkOrgs []string, gracePeriod time.Duration) *ZombieInstances {
	columns := []Column{
		{"cloud_project", true},
		{"zone", true},
		{"instance", true},
		{"status", false},
		{"seconds_online", false},
		{"timestamp", false},
	}

	return &ZombieInstances{
		computeClient: computeClient,
		cloudProjects: cloudProjects,
		bkClient:      bkClient,
		bkOrgs:        bkOrgs,
		columns:       columns,
		gracePeriod:   gracePeriod,
	}
}