19 changes: 13 additions & 6 deletions pkg/controller/controller.go
@@ -237,6 +237,13 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
// Filter into untainted and tainted nodes
untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes)

// Determine which nodes count toward capacity for utilisation calculation
capacityNodes := untaintedNodes
if nodeGroup.Opts.IncludeTaintedInCapacity {
capacityNodes = append(append([]*v1.Node{}, untaintedNodes...), taintedNodes...)
Contributor @tomwwright commented on Mar 26, 2026:

Just thinking through the premise of the fix: does it make more sense to instead change the calculation on the numerator side?

That is, rather than including tainted nodes in the available capacity (which, really, they aren't, since the scheduler can't place pods on them), we could exclude pods on tainted nodes from the load (which makes some sense, since those pods are draining).

The crux of the utilisation calculation is to determine what % of pressure is on the pool of resources available to the scheduler.

The concern is that computing that exclusion could be expensive: we would essentially need another filter pass over the pods to check whether each pod's assigned node is in our tainted list.

@dtnyn thoughts?

Collaborator replied:

I think either way will address the specific problem of the inaccurate numerator, but exclusion does seem more semantically accurate: as you mentioned, tainted nodes aren't actually "available capacity", so giving their workloads the same treatment makes sense.

I don't think the extra check would change the time complexity. At a cursory glance, it could be achieved by checking membership in a set of tainted nodes inside the loop we're already doing in mapPodsToNode(), which goes through all current pods.
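
For reference, a minimal sketch of what that exclusion pass might look like (the helper name filterPodsOnTaintedNodes is hypothetical, and this assumes the assigned node is read from the standard v1.Pod field Spec.NodeName):

```go
// Hypothetical helper: drop pods whose assigned node is in the tainted set,
// so their requests no longer count toward the utilisation numerator.
// Assumes v1 is "k8s.io/api/core/v1", as elsewhere in the controller.
func filterPodsOnTaintedNodes(pods []*v1.Pod, taintedNodes []*v1.Node) []*v1.Pod {
	// Build a set of tainted node names for O(1) membership checks.
	tainted := make(map[string]struct{}, len(taintedNodes))
	for _, n := range taintedNodes {
		tainted[n.Name] = struct{}{}
	}

	filtered := make([]*v1.Pod, 0, len(pods))
	for _, p := range pods {
		if _, onTainted := tainted[p.Spec.NodeName]; !onTainted {
			filtered = append(filtered, p)
		}
	}
	return filtered
}
```

Building the set and doing one membership check per pod keeps the pass at roughly O(nodes + pods), comparable to the existing mapPodsToNode() loop.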

log.WithField("nodegroup", nodegroup).Infof("Including %v tainted nodes in capacity calculation (total capacity nodes: %v)", len(taintedNodes), len(capacityNodes))
Collaborator commented:

Should this be at info level? This log will run on every tick and could be very noisy. Could it be changed to debug level, or only logged when len(taintedNodes) > 0?
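
A minimal sketch of that suggestion, assuming the same logrus-style logger used elsewhere in the diff:

```go
// Demote to debug level and only emit when there are tainted nodes to report,
// so the default info output stays quiet on ticks where nothing is tainted.
if len(taintedNodes) > 0 {
	log.WithField("nodegroup", nodegroup).Debugf(
		"Including %v tainted nodes in capacity calculation (total capacity nodes: %v)",
		len(taintedNodes), len(capacityNodes),
	)
}
```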

}

// Metrics and Logs
log.WithField("nodegroup", nodegroup).Infof("pods total: %v", len(pods))
log.WithField("nodegroup", nodegroup).Infof("nodes remaining total: %v", len(allNodes))
@@ -274,14 +281,14 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
// for working out which pods are on which nodes
nodeGroup.NodeInfoMap = k8s.CreateNodeNameToInfoMap(pods, allNodes)

// Calc capacity for untainted nodes
// Calc capacity for nodes included in utilisation calculation
podRequests, err := k8s.CalculatePodsRequestedUsage(pods)
if err != nil {
log.Errorf("Failed to calculate requests: %v", err)
return 0, err
}

nodeCapacity, err := k8s.CalculateNodesCapacity(untaintedNodes, pods)
nodeCapacity, err := k8s.CalculateNodesCapacity(capacityNodes, pods)
if err != nil {
log.Errorf("Failed to calculate capacity: %v", err)
return 0, err
@@ -319,14 +326,14 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
}

// Calc %
// both cpu and memory capacity are based on number of untainted nodes
// pass number of untainted nodes in to help make decision if it's a scaling-up-from-0
// both cpu and memory capacity are based on number of capacity nodes
// pass number of capacity nodes in to help make decision if it's a scaling-up-from-0
cpuPercent, memPercent, err := calcPercentUsage(
*podRequests.Total.GetCPUQuantity(),
*podRequests.Total.GetMemoryQuantity(),
*nodeCapacity.Total.GetCPUQuantity(),
*nodeCapacity.Total.GetMemoryQuantity(),
int64(len(untaintedNodes)))
int64(len(capacityNodes)))
if err != nil {
log.Errorf("Failed to calculate percentages: %v", err)
return 0, err
@@ -374,7 +381,7 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
// we want to add enough nodes such that the maxPercentage cluster util
// drops back below ScaleUpThresholdPercent
nodesDelta, err = calcScaleUpDelta(
untaintedNodes,
capacityNodes,
cpuPercent,
memPercent,
*podRequests.Total.GetCPUQuantity(),
218 changes: 218 additions & 0 deletions pkg/controller/controller_scale_node_group_test.go
@@ -1497,3 +1497,221 @@ func TestScaleNodeGroupNodeMaxAge(t *testing.T) {
})
}
}

func TestIncludeTaintedInCapacity(t *testing.T) {
t.Run("tainted nodes included in denominator prevents artificial utilisation spike", func(t *testing.T) {
nodeGroup := NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 2,
MaxNodes: 20,
TaintLowerCapacityThresholdPercent: 40,
TaintUpperCapacityThresholdPercent: 60,
ScaleUpThresholdPercent: 80,
FastNodeRemovalRate: 3,
SlowNodeRemovalRate: 1,
SoftDeleteGracePeriod: "5m",
HardDeleteGracePeriod: "10m",
ScaleUpCoolDownPeriod: "1m",
TaintEffect: "NoSchedule",
IncludeTaintedInCapacity: true,
}
nodeGroups := []NodeGroupOptions{nodeGroup}

// 5 untainted nodes + 5 tainted nodes = 10 total
untaintedNodes := test.BuildTestNodes(5, test.NodeOpts{
CPU: 1000,
Mem: 1000,
})
taintedNodes := test.BuildTestNodes(5, test.NodeOpts{
CPU: 1000,
Mem: 1000,
Tainted: true,
})
allNodes := append(untaintedNodes, taintedNodes...)

pods := test.BuildTestPods(9, test.PodOpts{
CPU: []int64{500},
Mem: []int64{500},
})

client, opts, err := buildTestClient(allNodes, pods, nodeGroups, ListerOptions{})
require.NoError(t, err)

testCloudProvider := test.NewCloudProvider(1)
testNodeGroup := test.NewNodeGroup(
nodeGroup.CloudProviderGroupName,
nodeGroup.Name,
int64(nodeGroup.MinNodes),
int64(nodeGroup.MaxNodes),
int64(len(allNodes)),
)
testCloudProvider.RegisterNodeGroup(testNodeGroup)

nodeGroupsState := BuildNodeGroupsState(nodeGroupsStateOpts{
nodeGroups: nodeGroups,
client: *client,
})

controller := &Controller{
Client: client,
Opts: opts,
stopChan: nil,
nodeGroups: nodeGroupsState,
cloudProvider: testCloudProvider,
}

nodesDelta, err := controller.scaleNodeGroup(nodeGroup.Name, nodeGroupsState[nodeGroup.Name])
require.NoError(t, err)

// With flag enabled, utilisation = 45% (not 90%), so no scale up
assert.LessOrEqual(t, nodesDelta, 0)
})

t.Run("flag disabled preserves existing behavior", func(t *testing.T) {
nodeGroup := NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 2,
MaxNodes: 20,
TaintLowerCapacityThresholdPercent: 40,
TaintUpperCapacityThresholdPercent: 60,
ScaleUpThresholdPercent: 80,
FastNodeRemovalRate: 3,
SlowNodeRemovalRate: 1,
SoftDeleteGracePeriod: "5m",
HardDeleteGracePeriod: "10m",
ScaleUpCoolDownPeriod: "1m",
TaintEffect: "NoSchedule",
IncludeTaintedInCapacity: false, // Default behavior
}
nodeGroups := []NodeGroupOptions{nodeGroup}

// 5 untainted nodes + 5 tainted nodes = 10 total
untaintedNodes := test.BuildTestNodes(5, test.NodeOpts{
CPU: 1000,
Mem: 1000,
})
taintedNodes := test.BuildTestNodes(5, test.NodeOpts{
CPU: 1000,
Mem: 1000,
Tainted: true,
})
allNodes := append(untaintedNodes, taintedNodes...)

pods := test.BuildTestPods(9, test.PodOpts{
CPU: []int64{500},
Mem: []int64{500},
})

client, opts, err := buildTestClient(allNodes, pods, nodeGroups, ListerOptions{})
require.NoError(t, err)

testCloudProvider := test.NewCloudProvider(1)
testNodeGroup := test.NewNodeGroup(
nodeGroup.CloudProviderGroupName,
nodeGroup.Name,
int64(nodeGroup.MinNodes),
int64(nodeGroup.MaxNodes),
int64(len(allNodes)),
)
testCloudProvider.RegisterNodeGroup(testNodeGroup)

nodeGroupsState := BuildNodeGroupsState(nodeGroupsStateOpts{
nodeGroups: nodeGroups,
client: *client,
})

controller := &Controller{
Client: client,
Opts: opts,
stopChan: nil,
nodeGroups: nodeGroupsState,
cloudProvider: testCloudProvider,
}

nodesDelta, err := controller.scaleNodeGroup(nodeGroup.Name, nodeGroupsState[nodeGroup.Name])
require.NoError(t, err)

// With flag disabled, utilisation = 90% > 80%, triggers scale up
assert.Greater(t, nodesDelta, 0)
})

t.Run("force-tainted and cordoned nodes are never included", func(t *testing.T) {
nodeGroup := NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 2,
MaxNodes: 20,
TaintLowerCapacityThresholdPercent: 40,
TaintUpperCapacityThresholdPercent: 60,
ScaleUpThresholdPercent: 80,
FastNodeRemovalRate: 3,
SlowNodeRemovalRate: 1,
SoftDeleteGracePeriod: "5m",
HardDeleteGracePeriod: "10m",
ScaleUpCoolDownPeriod: "1m",
TaintEffect: "NoSchedule",
IncludeTaintedInCapacity: true,
}
nodeGroups := []NodeGroupOptions{nodeGroup}

untaintedNodes := test.BuildTestNodes(3, test.NodeOpts{
CPU: 1000,
Mem: 1000,
})
taintedNodes := test.BuildTestNodes(2, test.NodeOpts{
CPU: 1000,
Mem: 1000,
Tainted: true,
})
forceTaintedNodes := test.BuildTestNodes(2, test.NodeOpts{
CPU: 1000,
Mem: 1000,
ForceTainted: true,
})
cordonedNodes := test.BuildTestNodes(2, test.NodeOpts{
CPU: 1000,
Mem: 1000,
Unschedulable: true,
})
allNodes := append(append(append(untaintedNodes, taintedNodes...), forceTaintedNodes...), cordonedNodes...)

pods := test.BuildTestPods(5, test.PodOpts{
CPU: []int64{500},
Mem: []int64{500},
})

client, opts, err := buildTestClient(allNodes, pods, nodeGroups, ListerOptions{})
require.NoError(t, err)

testCloudProvider := test.NewCloudProvider(1)
testNodeGroup := test.NewNodeGroup(
nodeGroup.CloudProviderGroupName,
nodeGroup.Name,
int64(nodeGroup.MinNodes),
int64(nodeGroup.MaxNodes),
int64(len(allNodes)),
)
testCloudProvider.RegisterNodeGroup(testNodeGroup)

nodeGroupsState := BuildNodeGroupsState(nodeGroupsStateOpts{
nodeGroups: nodeGroups,
client: *client,
})

controller := &Controller{
Client: client,
Opts: opts,
stopChan: nil,
nodeGroups: nodeGroupsState,
cloudProvider: testCloudProvider,
}

nodesDelta, err := controller.scaleNodeGroup(nodeGroup.Name, nodeGroupsState[nodeGroup.Name])
require.NoError(t, err)

// Capacity = 5 nodes (3 untainted + 2 tainted), utilisation = 50%
assert.Equal(t, -nodeGroup.SlowNodeRemovalRate, nodesDelta)
})
}
4 changes: 4 additions & 0 deletions pkg/controller/node_group.go
@@ -63,6 +63,10 @@ type NodeGroupOptions struct {
// allowed in the nodegroup at any given time.
MaxUnhealthyNodesPercent int `json:"max_unhealthy_nodes_percent,omitempty" yaml:"max_unhealthy_nodes_percent,omitempty"`

// IncludeTaintedInCapacity includes tainted nodes in the capacity denominator
// for utilisation calculations, preventing artificial spikes when nodes are tainted.
IncludeTaintedInCapacity bool `json:"include_tainted_in_capacity,omitempty" yaml:"include_tainted_in_capacity,omitempty"`

// Private variables for storing the parsed duration from the string
softDeleteGracePeriodDuration time.Duration
hardDeleteGracePeriodDuration time.Duration