feat: add deprovisioning metrics for eligible machines (#331)
* feat: add metrics for deprovisioning for each deprovisioner

* add string functions and populate metric call

* rebase

* remove naming

* fix names

* comments

* revert to histogram

* revert label

* inc

* add expiration
njtran authored and bwagner5 committed on May 16, 2023
1 parent f18b8e4 · commit fd3f7d1
Showing 9 changed files with 53 additions and 30 deletions.
1 change: 0 additions & 1 deletion pkg/controllers/deprovisioning/consolidation.go
@@ -104,7 +104,6 @@ func (c *consolidation) ShouldDeprovision(_ context.Context, cn *Candidate) bool
//
// nolint:gocyclo
func (c *consolidation) computeConsolidation(ctx context.Context, candidates ...*Candidate) (Command, error) {
    defer metrics.Measure(deprovisioningDurationHistogram.WithLabelValues("Replace/Delete"))()
    // Run scheduling simulation to compute consolidation option
    results, err := simulateScheduling(ctx, c.kubeClient, c.cluster, c.provisioner, candidates...)
    if err != nil {
7 changes: 6 additions & 1 deletion pkg/controllers/deprovisioning/controller.go
@@ -144,6 +144,7 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc
}

func (c *Controller) deprovision(ctx context.Context, deprovisioner Deprovisioner) (bool, error) {
    defer metrics.Measure(deprovisioningDurationHistogram.WithLabelValues(deprovisioner.String()))()
    candidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.clock, c.cloudProvider, deprovisioner.ShouldDeprovision)
    if err != nil {
        return false, fmt.Errorf("determining candidates, %w", err)
@@ -171,7 +172,11 @@ func (c *Controller) deprovision(ctx context.Context, deprovisioner Deprovisione
}

func (c *Controller) executeCommand(ctx context.Context, d Deprovisioner, command Command) error {
    deprovisioningActionsPerformedCounter.With(prometheus.Labels{"action": fmt.Sprintf("%s/%s", d, command.action)}).Add(1)
    deprovisioningActionsPerformedCounter.With(map[string]string{
        // TODO: make this just command.Action() since we've added the deprovisioner as its own label.
        actionLabel:        fmt.Sprintf("%s/%s", d, command.action),
        deprovisionerLabel: d.String(),
    }).Inc()
    logging.FromContext(ctx).Infof("deprovisioning via %s %s", d, command)

    reason := fmt.Sprintf("%s/%s", d, command.action)
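The defer metrics.Measure(...)() call added to deprovision() times the whole evaluation pass, producing one duration series per deprovisioner instead of the hardcoded "Replace/Delete" label removed from consolidation.go above. Below is a minimal, self-contained sketch of that idiom; the measure helper is a stand-in for Karpenter's metrics.Measure, whose behavior is inferred from this call site rather than quoted from the commit:

package main

import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

var evaluationDuration = prometheus.NewHistogramVec(
    prometheus.HistogramOpts{Name: "evaluation_duration_seconds"},
    []string{"method"},
)

// measure captures a start time immediately and returns a closure that
// observes the elapsed seconds when called.
func measure(observer prometheus.Observer) func() {
    start := time.Now()
    return func() { observer.Observe(time.Since(start).Seconds()) }
}

func deprovision() {
    // Note the trailing (): measure runs now, recording the start time,
    // while the returned closure runs at function exit via defer, so the
    // histogram records the full duration including early returns.
    defer measure(evaluationDuration.WithLabelValues("drift"))()
    // ... candidate discovery and command execution happen here ...
}

func main() {
    deprovision()
}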
1 change: 1 addition & 0 deletions pkg/controllers/deprovisioning/drift.go
@@ -62,6 +62,7 @@ func (d *Drift) ComputeCommand(ctx context.Context, nodes ...*Candidate) (Comman
    if err != nil {
        return Command{}, fmt.Errorf("filtering candidates, %w", err)
    }
    deprovisioningEligibleMachinesGauge.WithLabelValues(d.String()).Set(float64(len(candidates)))

    for _, candidate := range candidates {
        // Check if we need to create any machines.
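This one-line gauge update is the heart of the change: once a deprovisioner has filtered its candidates, it publishes how many machines it currently considers eligible, keyed by the deprovisioner's String() name so each mechanism gets its own series. The identical call is added to each of the remaining deprovisioners below. A minimal sketch of the pattern, with hypothetical stand-ins for the package's Candidate type and deprovisioner:

package deprovisioning

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

// Candidate stands in for the package's real candidate type.
type Candidate struct{}

var eligibleMachines = prometheus.NewGaugeVec(
    prometheus.GaugeOpts{
        Namespace: "karpenter", // assumed value of metrics.Namespace
        Subsystem: "deprovisioning",
        Name:      "eligible_machines",
        Help:      "Number of machines eligible for deprovisioning.",
    },
    []string{"deprovisioner"},
)

// publishEligible mirrors the one-liner added to each ComputeCommand.
// Set (rather than Inc) is used because the gauge reports the current
// eligible count, recomputed from scratch on every evaluation pass.
func publishEligible(d fmt.Stringer, candidates []*Candidate) {
    eligibleMachines.WithLabelValues(d.String()).Set(float64(len(candidates)))
}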
1 change: 1 addition & 0 deletions pkg/controllers/deprovisioning/emptiness.go
@@ -66,6 +66,7 @@ func (e *Emptiness) ComputeCommand(_ context.Context, candidates ...*Candidate)
    emptyCandidates := lo.Filter(candidates, func(cn *Candidate, _ int) bool {
        return cn.Node.DeletionTimestamp.IsZero() && len(cn.pods) == 0
    })
    deprovisioningEligibleMachinesGauge.WithLabelValues(e.String()).Set(float64(len(candidates)))

    if len(emptyCandidates) == 0 {
        return Command{action: actionDoNothing}, nil
@@ -49,6 +49,7 @@ func (c *EmptyMachineConsolidation) ComputeCommand(ctx context.Context, candidat
    if err != nil {
        return Command{}, fmt.Errorf("sorting candidates, %w", err)
    }
    deprovisioningEligibleMachinesGauge.WithLabelValues(c.String()).Set(float64(len(candidates)))

    // select the entirely empty nodes
    emptyCandidates := lo.Filter(candidates, func(n *Candidate, _ int) bool { return len(n.pods) == 0 })
1 change: 1 addition & 0 deletions pkg/controllers/deprovisioning/expiration.go
@@ -83,6 +83,7 @@ func (e *Expiration) ComputeCommand(ctx context.Context, nodes ...*Candidate) (C
    if err != nil {
        return Command{}, fmt.Errorf("filtering candidates, %w", err)
    }
    deprovisioningEligibleMachinesGauge.WithLabelValues(e.String()).Set(float64(len(candidates)))

    for _, candidate := range candidates {
        // Check if we need to create any nodes.
69 changes: 41 additions & 28 deletions pkg/controllers/deprovisioning/metrics.go
@@ -25,36 +25,49 @@ func init() {
    crmetrics.Registry.MustRegister(deprovisioningDurationHistogram)
    crmetrics.Registry.MustRegister(deprovisioningReplacementNodeInitializedHistogram)
    crmetrics.Registry.MustRegister(deprovisioningActionsPerformedCounter)
    crmetrics.Registry.MustRegister(deprovisioningEligibleMachinesGauge)
}

const deprovisioningSubsystem = "deprovisioning"

var deprovisioningDurationHistogram = prometheus.NewHistogramVec(
    prometheus.HistogramOpts{
        Namespace: metrics.Namespace,
        Subsystem: deprovisioningSubsystem,
        Name:      "evaluation_duration_seconds",
        Help:      "Duration of the deprovisioning evaluation process in seconds.",
        Buckets:   metrics.DurationBuckets(),
    },
    []string{"method"},
const (
    deprovisioningSubsystem = "deprovisioning"
    deprovisionerLabel      = "deprovisioner"
    actionLabel             = "action"
)

var deprovisioningReplacementNodeInitializedHistogram = prometheus.NewHistogram(
    prometheus.HistogramOpts{
        Namespace: metrics.Namespace,
        Subsystem: deprovisioningSubsystem,
        Name:      "replacement_node_initialized_seconds",
        Help:      "Amount of time required for a replacement node to become initialized.",
        Buckets:   metrics.DurationBuckets(),
    })

var deprovisioningActionsPerformedCounter = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Namespace: metrics.Namespace,
        Subsystem: deprovisioningSubsystem,
        Name:      "actions_performed",
        Help:      "Number of deprovisioning actions performed. Labeled by action.",
    },
    []string{"action"},
var (
    deprovisioningDurationHistogram = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Namespace: metrics.Namespace,
            Subsystem: deprovisioningSubsystem,
            Name:      "evaluation_duration_seconds",
            Help:      "Duration of the deprovisioning evaluation process in seconds.",
            Buckets:   metrics.DurationBuckets(),
        },
        []string{"method"})
    deprovisioningReplacementNodeInitializedHistogram = prometheus.NewHistogram(
        prometheus.HistogramOpts{
            Namespace: metrics.Namespace,
            Subsystem: deprovisioningSubsystem,
            Name:      "replacement_node_initialized_seconds",
            Help:      "Amount of time required for a replacement node to become initialized.",
            Buckets:   metrics.DurationBuckets(),
        })
    deprovisioningActionsPerformedCounter = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: metrics.Namespace,
            Subsystem: deprovisioningSubsystem,
            Name:      "actions_performed",
            Help:      "Number of deprovisioning actions performed. Labeled by deprovisioner and action.",
        },
        []string{actionLabel, deprovisionerLabel},
    )
    deprovisioningEligibleMachinesGauge = prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Namespace: metrics.Namespace,
            Subsystem: deprovisioningSubsystem,
            Name:      "eligible_machines",
            Help:      "Number of machines eligible for deprovisioning by Karpenter. Labeled by deprovisioner.",
        },
        []string{deprovisionerLabel},
    )
)
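Assuming metrics.Namespace resolves to "karpenter", these definitions surface as karpenter_deprovisioning_eligible_machines (one series per deprovisioner) and karpenter_deprovisioning_actions_performed (one series per action/deprovisioner pair). A quick way to sanity-check the two-label counter locally with client_golang's testutil package; the label values below are illustrative, following the fmt.Sprintf("%s/%s", d, command.action) format used in executeCommand:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
    actionsPerformed := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "karpenter",
            Subsystem: "deprovisioning",
            Name:      "actions_performed",
            Help:      "Number of deprovisioning actions performed.",
        },
        []string{"action", "deprovisioner"},
    )

    // Each distinct label combination becomes its own series.
    actionsPerformed.With(prometheus.Labels{
        "action":        "expiration/delete", // illustrative composite value
        "deprovisioner": "expiration",
    }).Inc()

    // testutil.ToFloat64 reads back the current value of a single child series.
    fmt.Println(testutil.ToFloat64(actionsPerformed.WithLabelValues("expiration/delete", "expiration"))) // 1
}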
@@ -47,6 +47,7 @@ func (m *MultiMachineConsolidation) ComputeCommand(ctx context.Context, candidat
    if err != nil {
        return Command{}, fmt.Errorf("sorting candidates, %w", err)
    }
    deprovisioningEligibleMachinesGauge.WithLabelValues(m.String()).Set(float64(len(candidates)))

    // For now, we will consider up to every machine in the cluster, might be configurable in the future.
    maxParallel := len(candidates)
@@ -48,6 +48,7 @@ func (c *SingleMachineConsolidation) ComputeCommand(ctx context.Context, candida
    if err != nil {
        return Command{}, fmt.Errorf("sorting candidates, %w", err)
    }
    deprovisioningEligibleMachinesGauge.WithLabelValues(c.String()).Set(float64(len(candidates)))

    v := NewValidation(consolidationTTL, c.clock, c.cluster, c.kubeClient, c.provisioner, c.cloudProvider, c.recorder)
    for _, candidate := range candidates {
