From 2529674ab1bb0b6f2e7490751aa31b24018e16fd Mon Sep 17 00:00:00 2001
From: Jonathan Innis
Date: Mon, 23 Oct 2023 09:41:04 -0700
Subject: [PATCH] test: Add Expiration testing for E2E for v1beta1 (#4887)

---
 .github/workflows/e2e-matrix.yaml             |   2 +-
 .github/workflows/e2e.yaml                    |   2 +
 test/pkg/environment/common/environment.go    |   8 +
 test/pkg/environment/common/expectations.go   |  36 +-
 .../suites/beta/expiration/expiration_test.go | 355 ++++++++++++++++++
 5 files changed, 401 insertions(+), 2 deletions(-)
 create mode 100644 test/suites/beta/expiration/expiration_test.go

diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml
index a82e5cfa2b2f..6e76c78540f3 100644
--- a/.github/workflows/e2e-matrix.yaml
+++ b/.github/workflows/e2e-matrix.yaml
@@ -49,7 +49,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6]
+        suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/Expiration, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6]
     uses: ./.github/workflows/e2e.yaml
     with:
       suite: ${{ matrix.suite }}
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index fde21f6bcec9..0da41c069041 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -18,6 +18,8 @@ on:
           - Beta/Integration
           - Beta/Drift
           - Beta/Consolidation
+          - Beta/Expiration
+          - Beta/NodeClaim
           - Alpha/Integration
           - Alpha/Machine
           - Alpha/Consolidation
diff --git a/test/pkg/environment/common/environment.go b/test/pkg/environment/common/environment.go
index 026aa71f8368..bab195b7c677 100644
--- a/test/pkg/environment/common/environment.go
+++ b/test/pkg/environment/common/environment.go
@@ -37,6 +37,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	coreapis "github.com/aws/karpenter-core/pkg/apis"
+	"github.com/aws/karpenter-core/pkg/apis/v1beta1"
 	"github.com/aws/karpenter-core/pkg/operator"
 	"github.com/aws/karpenter-core/pkg/operator/injection"
 	"github.com/aws/karpenter/pkg/apis"
@@ -116,6 +117,13 @@ func NewClient(ctx context.Context, config *rest.Config) client.Client {
 		node := o.(*v1.Node)
 		return []string{strconv.FormatBool(node.Spec.Unschedulable)}
 	}))
+	lo.Must0(cache.IndexField(ctx, &v1.Node{}, "spec.taints[*].karpenter.sh/disruption", func(o client.Object) []string {
+		node := o.(*v1.Node)
+		t, _ := lo.Find(node.Spec.Taints, func(t v1.Taint) bool {
+			return t.Key == v1beta1.DisruptionTaintKey
+		})
+		return []string{t.Value}
+	}))
 
 	c := lo.Must(client.New(config, client.Options{Scheme: scheme, Cache: &client.CacheOptions{Reader: cache}}))
 
diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go
index 8f3844da19b8..d0a9905cb610 100644
--- a/test/pkg/environment/common/expectations.go
+++ b/test/pkg/environment/common/expectations.go
@@ -460,7 +460,7 @@ func (env *Environment) ConsistentlyExpectMachineCount(comparator string, count
 	return lo.ToSlicePtr(machineList.Items)
 }
 
-func (env *Environment) EventuallyExpectCordonedNodeCount(comparator string, count int) []*v1.Node {
+func (env *Environment) EventuallyExpectCordonedNodeCountLegacy(comparator string, count int) []*v1.Node {
 	GinkgoHelper()
 	By(fmt.Sprintf("waiting for cordoned nodes to be %s to %d", comparator, count))
 	nodeList := &v1.NodeList{}
@@ -472,6 +472,40 @@ func (env *Environment) EventuallyExpectCordonedNodeCount(comparator string, cou
 	return lo.ToSlicePtr(nodeList.Items)
 }
 
+func (env *Environment) EventuallyExpectNodesUncordonedLegacyWithTimeout(timeout time.Duration, nodes ...*v1.Node) {
+	GinkgoHelper()
+	By(fmt.Sprintf("waiting for %d nodes to be uncordoned", len(nodes)))
+	nodeList := &v1.NodeList{}
+	Eventually(func(g Gomega) {
+		g.Expect(env.Client.List(env, nodeList, client.MatchingFields{"spec.unschedulable": "true"})).To(Succeed())
+		cordonedNodeNames := lo.Map(nodeList.Items, func(n v1.Node, _ int) string { return n.Name })
+		g.Expect(cordonedNodeNames).ToNot(ContainElements(lo.Map(nodes, func(n *v1.Node, _ int) interface{} { return n.Name })...))
+	}).WithTimeout(timeout).Should(Succeed())
+}
+
+func (env *Environment) EventuallyExpectCordonedNodeCount(comparator string, count int) []*v1.Node {
+	GinkgoHelper()
+	By(fmt.Sprintf("waiting for cordoned nodes to be %s to %d", comparator, count))
+	nodeList := &v1.NodeList{}
+	Eventually(func(g Gomega) {
+		g.Expect(env.Client.List(env, nodeList, client.MatchingFields{"spec.taints[*].karpenter.sh/disruption": "disrupting"})).To(Succeed())
+		g.Expect(len(nodeList.Items)).To(BeNumerically(comparator, count),
+			fmt.Sprintf("expected %d cordoned nodes, had %d (%v)", count, len(nodeList.Items), NodeNames(lo.ToSlicePtr(nodeList.Items))))
+	}).Should(Succeed())
+	return lo.ToSlicePtr(nodeList.Items)
+}
+
+func (env *Environment) EventuallyExpectNodesUncordonedWithTimeout(timeout time.Duration, nodes ...*v1.Node) {
+	GinkgoHelper()
+	By(fmt.Sprintf("waiting for %d nodes to be uncordoned", len(nodes)))
+	nodeList := &v1.NodeList{}
+	Eventually(func(g Gomega) {
+		g.Expect(env.Client.List(env, nodeList, client.MatchingFields{"spec.taints[*].karpenter.sh/disruption": "disrupting"})).To(Succeed())
+		cordonedNodeNames := lo.Map(nodeList.Items, func(n v1.Node, _ int) string { return n.Name })
+		g.Expect(cordonedNodeNames).ToNot(ContainElements(lo.Map(nodes, func(n *v1.Node, _ int) interface{} { return n.Name })...))
+	}).WithTimeout(timeout).Should(Succeed())
+}
+
 func (env *Environment) EventuallyExpectNodeCount(comparator string, count int) []*v1.Node {
 	GinkgoHelper()
 	By(fmt.Sprintf("waiting for nodes to be %s to %d", comparator, count))
diff --git a/test/suites/beta/expiration/expiration_test.go b/test/suites/beta/expiration/expiration_test.go
new file mode 100644
index 000000000000..224d12827e8f
--- /dev/null
+++ b/test/suites/beta/expiration/expiration_test.go
@@ -0,0 +1,355 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package expiration_test
+
+import (
+	"fmt"
+	"testing"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/samber/lo"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"github.com/aws/aws-sdk-go/service/ssm"
+
+	corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
+	"github.com/aws/karpenter/pkg/apis/v1beta1"
+	"github.com/aws/karpenter/pkg/test"
+	"github.com/aws/karpenter/test/pkg/environment/aws"
+
+	coretest "github.com/aws/karpenter-core/pkg/test"
+)
+
+var env *aws.Environment
+var nodeClass *v1beta1.EC2NodeClass
+var nodePool *corev1beta1.NodePool
+
+func TestExpiration(t *testing.T) {
+	RegisterFailHandler(Fail)
+	BeforeSuite(func() {
+		env = aws.NewEnvironment(t)
+	})
+	AfterSuite(func() {
+		env.Stop()
+	})
+	RunSpecs(t, "Beta/Expiration")
+}
+
+var _ = BeforeEach(func() {
+	env.BeforeEach()
+	nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{
+		Spec: v1beta1.EC2NodeClassSpec{
+			AMIFamily: &v1beta1.AMIFamilyAL2,
+			SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
+				{
+					Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
+				},
+			},
+			SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
+				{
+					Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
+				},
+			},
+			Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
+		},
+	})
+	nodePool = coretest.NodePool(corev1beta1.NodePool{
+		Spec: corev1beta1.NodePoolSpec{
+			Template: corev1beta1.NodeClaimTemplate{
+				Spec: corev1beta1.NodeClaimSpec{
+					NodeClassRef: &corev1beta1.NodeClassReference{
+						Name: nodeClass.Name,
+					},
+				},
+			},
+			Disruption: corev1beta1.Disruption{
+				ExpireAfter: corev1beta1.NillableDuration{Duration: lo.ToPtr(time.Second * 30)},
+			},
+		},
+	})
+})
+
+var _ = AfterEach(func() { env.Cleanup() })
+var _ = AfterEach(func() { env.AfterEach() })
+
+var _ = Describe("Expiration", func() {
+	It("should expire the node after the expiration is reached", func() {
+		var numPods int32 = 1
+		dep := coretest.Deployment(coretest.DeploymentOptions{
+			Replicas: numPods,
+			PodOptions: coretest.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Annotations: map[string]string{
+						corev1beta1.DoNotDisruptAnnotationKey: "true",
+					},
+					Labels: map[string]string{"app": "large-app"},
+				},
+			},
+		})
+		selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+		env.ExpectCreated(nodeClass, nodePool, dep)
+
+		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+		node := env.EventuallyExpectCreatedNodeCount("==", 1)[0]
+		env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+		env.Monitor.Reset() // Reset the monitor so that we can expect a single node to be spun up after expiration
+
+		// Expect that the NodeClaim will get an expired status condition
+		Eventually(func(g Gomega) {
+			g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
+			g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+		}).Should(Succeed())
+
+		// Remove the do-not-disrupt annotation so that the Nodes are now deprovisionable
+		for _, pod := range env.ExpectPodsMatchingSelector(selector) {
+			delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey)
+			env.ExpectUpdated(pod)
+		}
+
+		// Eventually the node will be set as unschedulable, which means it's actively being deprovisioned
+		Eventually(func(g Gomega) {
+			n := &v1.Node{}
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), n)).Should(Succeed())
+			_, ok := lo.Find(n.Spec.Taints, func(t v1.Taint) bool {
+				return corev1beta1.IsDisruptingTaint(t)
+			})
+			g.Expect(ok).To(BeTrue())
+		}).Should(Succeed())
+
+		// Set the expireAfter to "Never" to make sure the new node isn't deleted
+		// This is CRITICAL since it prevents nodes that are immediately spun up from immediately being expired and
+		// racing at the end of the E2E test, leaking node resources into subsequent tests
+		nodePool.Spec.Disruption.ExpireAfter.Duration = nil
+		env.ExpectUpdated(nodePool)
+
+		// After the deletion timestamp is set and all pods are drained
+		// the node should be gone
+		env.EventuallyExpectNotFound(nodeClaim, node)
+
+		env.EventuallyExpectCreatedNodeClaimCount("==", 1)
+		env.EventuallyExpectCreatedNodeCount("==", 1)
+		env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+	})
+	It("should replace expired node with a single node and schedule all pods", func() {
+		var numPods int32 = 5
+		// Set up a PDB that will only allow 1 pod at a time to be unavailable
+		minAvailable := intstr.FromInt32(numPods - 1)
+		pdb := coretest.PodDisruptionBudget(coretest.PDBOptions{
+			Labels: map[string]string{
+				"app": "large-app",
+			},
+			MinAvailable: &minAvailable,
+		})
+		dep := coretest.Deployment(coretest.DeploymentOptions{
+			Replicas: numPods,
+			PodOptions: coretest.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Annotations: map[string]string{
+						corev1beta1.DoNotDisruptAnnotationKey: "true",
+					},
+					Labels: map[string]string{"app": "large-app"},
+				},
+			},
+		})
+		selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+		env.ExpectCreated(nodeClass, nodePool, pdb, dep)
+
+		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+		node := env.EventuallyExpectCreatedNodeCount("==", 1)[0]
+		env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+		env.Monitor.Reset() // Reset the monitor so that we can expect a single node to be spun up after expiration
+
+		// Set the expireAfter value to get the node deleted
+		nodePool.Spec.Disruption.ExpireAfter.Duration = lo.ToPtr(time.Minute)
+		env.ExpectUpdated(nodePool)
+
+		// Expect that the NodeClaim will get an expired status condition
+		Eventually(func(g Gomega) {
+			g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
+			g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+		}).Should(Succeed())
+
+		// Remove the do-not-disrupt annotation so that the Nodes are now deprovisionable
+		for _, pod := range env.ExpectPodsMatchingSelector(selector) {
+			delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey)
+			env.ExpectUpdated(pod)
+		}
+
+		// Eventually the node will be set as unschedulable, which means it's actively being deprovisioned
+		Eventually(func(g Gomega) {
+			n := &v1.Node{}
+			g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), n)).Should(Succeed())
+			_, ok := lo.Find(n.Spec.Taints, func(t v1.Taint) bool {
+				return corev1beta1.IsDisruptingTaint(t)
+			})
+			g.Expect(ok).To(BeTrue())
+		}).Should(Succeed())
+
+		// Set the expireAfter to "Never" to make sure the new node isn't deleted
+		// This is CRITICAL since it prevents nodes that are immediately spun up from immediately being expired and
+		// racing at the end of the E2E test, leaking node resources into subsequent tests
+		nodePool.Spec.Disruption.ExpireAfter.Duration = nil
+		env.ExpectUpdated(nodePool)
+
+		// After the deletion timestamp is set and all pods are drained
+		// the node should be gone
+		env.EventuallyExpectNotFound(nodeClaim, node)
+
+		env.EventuallyExpectCreatedNodeClaimCount("==", 1)
+		env.EventuallyExpectCreatedNodeCount("==", 1)
+		env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+	})
+	Context("Expiration Failure", func() {
+		It("should not continue to expire if a node never registers", func() {
+			// Launch new NodeClaims
+			var numPods int32 = 2
+			dep := coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: 2,
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+					PodAntiRequirements: []v1.PodAffinityTerm{{
+						TopologyKey: v1.LabelHostname,
+						LabelSelector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"app": "inflate"},
+						}},
+					},
+				},
+			})
+			env.ExpectCreated(dep, nodeClass, nodePool)
+
+			startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+			env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+			// Set a configuration that will not register a NodeClaim
+			parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{
+				Name: lo.ToPtr("/aws/service/ami-amazon-linux-latest/amzn-ami-hvm-x86_64-ebs"),
+			})
+			Expect(err).ToNot(HaveOccurred())
+			nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{
+				{
+					ID: *parameter.Parameter.Value,
+				},
+			}
+			env.ExpectCreatedOrUpdated(nodeClass)
+
+			// Should see that the NodeClaims have expired
+			Eventually(func(g Gomega) {
+				for _, nc := range startingNodeClaimState {
+					g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nc), nc)).To(Succeed())
+					g.Expect(nc.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+				}
+			}).Should(Succeed())
+
+			// Expect nodes to get cordoned
+			cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+
+			// Expire should fail and the original node should be uncordoned
+			// TODO: reduce timeouts when deprovisioning waits are factored out
+			env.EventuallyExpectNodesUncordonedWithTimeout(11*time.Minute, cordonedNodes...)
+
+			// The NodeClaims that never register will be removed
+			Eventually(func(g Gomega) {
+				nodeClaims := &corev1beta1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+				g.Expect(len(nodeClaims.Items)).To(BeNumerically("==", int(numPods)))
+			}).WithTimeout(6 * time.Minute).Should(Succeed())
+
+			// Expect that all the NodeClaims that existed during the initial provisioning loop are not removed
+			Consistently(func(g Gomega) {
+				nodeClaims := &corev1beta1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+
+				startingNodeClaimUIDs := lo.Map(startingNodeClaimState, func(nc *corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+				nodeClaimUIDs := lo.Map(nodeClaims.Items, func(nc corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+				g.Expect(sets.New(nodeClaimUIDs...).IsSuperset(sets.New(startingNodeClaimUIDs...))).To(BeTrue())
+			}, "2m").Should(Succeed())
+		})
+		It("should not continue to expire if a node registers but never becomes initialized", func() {
+			// Set a configuration that will allow us to keep a NodeClaim from being initialized
+			nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com/taint", Effect: v1.TaintEffectPreferNoSchedule}}
+
+			// Launch new NodeClaims
+			var numPods int32 = 2
+			dep := coretest.Deployment(coretest.DeploymentOptions{
+				Replicas: 2,
+				PodOptions: coretest.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+					PodAntiRequirements: []v1.PodAffinityTerm{{
+						TopologyKey: v1.LabelHostname,
+						LabelSelector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"app": "inflate"},
+						}},
+					},
+				},
+			})
+			env.ExpectCreated(dep, nodeClass, nodePool)
+
+			startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+			nodes := env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+			// Remove the startup taints from these nodes to initialize them
+			Eventually(func(g Gomega) {
+				for _, node := range nodes {
+					g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed())
+					stored := node.DeepCopy()
+					node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool { return t.Key == "example.com/taint" })
+					g.Expect(env.Client.Patch(env.Context, node, client.MergeFrom(stored))).To(Succeed())
+				}
+			}).Should(Succeed())
+
+			// Should see that the NodeClaims have expired
+			Eventually(func(g Gomega) {
+				for _, nc := range startingNodeClaimState {
+					g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nc), nc)).To(Succeed())
+					g.Expect(nc.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+				}
+			}).Should(Succeed())
+
+			// Expect nodes to be cordoned
+			cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+
+			// Expire should fail and the original node should be uncordoned, and no NodeClaims should be removed
+			// TODO: reduce timeouts when deprovisioning waits are factored out
+			env.EventuallyExpectNodesUncordonedWithTimeout(11*time.Minute, cordonedNodes...)
+
+			// Expect that the new NodeClaim/Node is kept around after the un-cordon
+			nodeList := &v1.NodeList{}
+			Expect(env.Client.List(env, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+			Expect(nodeList.Items).To(HaveLen(int(numPods) + 1))
+
+			nodeClaimList := &corev1beta1.NodeClaimList{}
+			Expect(env.Client.List(env, nodeClaimList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+			Expect(nodeClaimList.Items).To(HaveLen(int(numPods) + 1))
+
+			// Expect that all the NodeClaims that existed during the initial provisioning loop are not removed
+			Consistently(func(g Gomega) {
+				nodeClaims := &corev1beta1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+
+				startingNodeClaimUIDs := lo.Map(startingNodeClaimState, func(nc *corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+				nodeClaimUIDs := lo.Map(nodeClaims.Items, func(nc corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+				g.Expect(sets.New(nodeClaimUIDs...).IsSuperset(sets.New(startingNodeClaimUIDs...))).To(BeTrue())
+			}, "2m").Should(Succeed())
+		})
+	})
+})