Skip to content

Commit

Permalink
PAASTA-18198: add pool_node_affinities to improve Karpenter AZ balance (
Browse files Browse the repository at this point in the history
#3849)

This adds a new system config option, `pool_node_affinities`.
It allows configuring pods with a preselected set of 2-3 preferred AZs per pool, so that Karpenter does not have to consider all 6+ AZs in a region when making scheduling decisions.

This option is a temporary solution and the proper fix will likely be to adopt something like the descheduler to achieve a better AZ balance (along with other benefits).
 
Signed-off-by: Max Falk <gfalk@yelp.com>
  • Loading branch information
gmdfalk committed May 2, 2024
1 parent 54504ed commit 5e91e8c
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 3 deletions.
33 changes: 30 additions & 3 deletions paasta_tools/kubernetes_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,13 @@
"hostname",
"owner",
}
# Node-label keys whose presence in a service's node_selectors indicates the
# service already pins itself to a zone (or a zone-equivalent grouping such as
# a habitat). Pool-level zone affinities are skipped for such services so the
# pool default does not override an explicit per-service zone selection.
ZONE_LABELS = (
    "topology.kubernetes.io/zone",
    "yelp.com/habitat",
    "yelp.com/eni_config",
    "karpenter.sh/nodepool",
    "topology.ebs.csi.aws.com/zone",
)

GPU_RESOURCE_NAME = "nvidia.com/gpu"
DEFAULT_STORAGE_CLASS_NAME = "ebs"
Expand Down Expand Up @@ -681,6 +688,10 @@ def registration_label(namespace: str) -> str:
return f"registrations.{PAASTA_ATTRIBUTE_PREFIX}{limited_namespace}"


def contains_zone_label(node_selectors: Dict[str, NodeSelectorConfig]) -> bool:
    """Return True if any zone-pinning label key appears in *node_selectors*."""
    for zone_label in ZONE_LABELS:
        if zone_label in node_selectors:
            return True
    return False


class KubernetesDeploymentConfig(LongRunningServiceConfig):
config_dict: KubernetesDeploymentConfigDict

Expand Down Expand Up @@ -2139,7 +2150,9 @@ def get_pod_template_spec(
)
# need to check if there are node selectors/affinities. if there are none
# and we create an empty affinity object, k8s will deselect all nodes.
node_affinity = self.get_node_affinity()
node_affinity = self.get_node_affinity(
system_paasta_config.get_pool_node_affinities()
)
if node_affinity is not None:
pod_spec_kwargs["affinity"] = V1Affinity(node_affinity=node_affinity)

Expand Down Expand Up @@ -2283,7 +2296,9 @@ def get_node_selector(self) -> Mapping[str, str]:
node_selectors["yelp.com/pool"] = self.get_pool()
return node_selectors

def get_node_affinity(self) -> Optional[V1NodeAffinity]:
def get_node_affinity(
self, pool_node_affinities: Dict[str, Dict[str, List[str]]] = None
) -> Optional[V1NodeAffinity]:
"""Converts deploy_whitelist and deploy_blacklist in node affinities.
note: At the time of writing, `kubectl describe` does not show affinities,
Expand All @@ -2293,12 +2308,24 @@ def get_node_affinity(self) -> Optional[V1NodeAffinity]:
allowlist=self.get_deploy_whitelist(),
denylist=self.get_deploy_blacklist(),
)
node_selectors = self.config_dict.get("node_selectors", {})
requirements.extend(
raw_selectors_to_requirements(
raw_selectors=self.config_dict.get("node_selectors", {}),
raw_selectors=node_selectors,
)
)

# PAASTA-18198: To improve AZ balance with Karpenter, we temporarily allow specifying zone affinities per pool
if pool_node_affinities and self.get_pool() in pool_node_affinities:
current_pool_node_affinities = pool_node_affinities[self.get_pool()]
# If the service already has a node selector for a zone, we don't want to override it
if current_pool_node_affinities and not contains_zone_label(node_selectors):
requirements.extend(
raw_selectors_to_requirements(
raw_selectors=current_pool_node_affinities,
)
)

preferred_terms = []
for node_selectors_prefered_config_dict in self.config_dict.get(
"node_selectors_preferred", []
Expand Down
5 changes: 5 additions & 0 deletions paasta_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2020,6 +2020,7 @@ class SystemPaastaConfigDict(TypedDict, total=False):
pdb_max_unavailable: Union[str, int]
pki_backend: str
pod_defaults: Dict[str, Any]
pool_node_affinities: Dict[str, Dict[str, List[str]]]
topology_spread_constraints: List[TopologySpreadConstraintDict]
previous_marathon_servers: List[MarathonConfigDict]
readiness_check_prefix_template: List[str]
Expand Down Expand Up @@ -2645,6 +2646,10 @@ def get_taskproc(self) -> Dict:
def get_disabled_watchers(self) -> List:
    """Return the configured list of disabled watchers, or an empty list if unset."""
    try:
        return self.config_dict["disabled_watchers"]
    except KeyError:
        return []

def get_pool_node_affinities(self) -> Dict[str, Dict[str, List[str]]]:
    """Per-pool node selectors that will be applied to all Pods in a pool."""
    try:
        return self.config_dict["pool_node_affinities"]
    except KeyError:
        return {}

def get_topology_spread_constraints(self) -> List[TopologySpreadConstraintDict]:
"""List of TopologySpreadConstraints that will be applied to all Pods in the cluster"""
return self.config_dict.get("topology_spread_constraints", [])
Expand Down
112 changes: 112 additions & 0 deletions tests/test_kubernetes_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2029,6 +2029,118 @@ def test_get_node_affinity_with_preferences(self):
],
)

def test_get_node_affinity_no_reqs_with_global_override(self):
    """
    Given global node affinity overrides and no deployment specific requirements, the globals should be used
    """
    pool_overrides = {
        "default": {"topology.kubernetes.io/zone": ["us-west-1a", "us-west-1b"]}
    }
    zone_term = V1NodeSelectorTerm(
        match_expressions=[
            V1NodeSelectorRequirement(
                key="topology.kubernetes.io/zone",
                operator="In",
                values=["us-west-1a", "us-west-1b"],
            )
        ]
    )
    expected = V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=V1NodeSelector(
            node_selector_terms=[zone_term],
        ),
    )
    assert self.deployment.get_node_affinity(pool_overrides) == expected

def test_get_node_affinity_no_reqs_with_global_override_and_deployment_config(self):
    """
    Given global node affinity overrides and deployment specific requirements, globals should be ignored
    """
    deployment = KubernetesDeploymentConfig(
        service="kurupt",
        instance="fm",
        cluster="brentford",
        config_dict={
            "node_selectors": {"topology.kubernetes.io/zone": ["us-west-1a"]},
            "node_selectors_preferred": [
                {
                    "weight": 1,
                    "preferences": {"instance_type": ["a1.1xlarge"]},
                }
            ],
        },
        branch_dict=None,
        soa_dir="/nail/blah",
    )
    pool_overrides = {
        "default": {"topology.kubernetes.io/zone": ["us-west-1a", "us-west-1b"]}
    }
    required = V1NodeSelector(
        node_selector_terms=[
            V1NodeSelectorTerm(
                match_expressions=[
                    V1NodeSelectorRequirement(
                        key="topology.kubernetes.io/zone",
                        operator="In",
                        values=["us-west-1a"],
                    ),
                ]
            )
        ],
    )
    preferred = [
        V1PreferredSchedulingTerm(
            weight=1,
            preference=V1NodeSelectorTerm(
                match_expressions=[
                    V1NodeSelectorRequirement(
                        key="node.kubernetes.io/instance-type",
                        operator="In",
                        values=["a1.1xlarge"],
                    ),
                ]
            ),
        )
    ]
    expected = V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=required,
        preferred_during_scheduling_ignored_during_execution=preferred,
    )
    actual = deployment.get_node_affinity(pool_overrides)
    assert actual == expected

def test_get_node_affinity_no_reqs_with_global_override_and_deployment_config_habitat(
    self,
):
    """
    Given global node affinity overrides and deployment specific zone selector, globals should be ignored
    """
    deployment = KubernetesDeploymentConfig(
        service="kurupt",
        instance="fm",
        cluster="brentford",
        config_dict={"node_selectors": {"yelp.com/habitat": ["uswest1astagef"]}},
        branch_dict=None,
        soa_dir="/nail/blah",
    )
    habitat_term = V1NodeSelectorTerm(
        match_expressions=[
            V1NodeSelectorRequirement(
                key="yelp.com/habitat",
                operator="In",
                values=["uswest1astagef"],
            ),
        ]
    )
    expected = V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=V1NodeSelector(
            node_selector_terms=[habitat_term],
        )
    )
    actual = deployment.get_node_affinity(
        {"default": {"topology.kubernetes.io/zone": ["us-west-1a", "us-west-1b"]}},
    )
    assert actual == expected

@pytest.mark.parametrize(
"anti_affinity,expected",
[
Expand Down

0 comments on commit 5e91e8c

Please sign in to comment.