Commit bc4564e — Feat: Nodes are now owned by Provisioners, and will cascade on delete by default (#1934)

* Feat: Nodes are now owned by Provisioners, and will cascade on delete by default

* added upgrade docs
ellistarn committed Jun 16, 2022
1 parent eda8fd7 commit bc4564e
Showing 7 changed files with 52 additions and 14 deletions.
9 changes: 3 additions & 6 deletions charts/karpenter/templates/clusterrole.yaml
@@ -25,16 +25,13 @@ rules:
     resources: ["storageclasses"]
     verbs: ["get", "list", "watch"]
   - apiGroups: [""]
-    resources: ["nodes", "pods"]
-    verbs: ["get", "list", "watch", "patch", "delete"]
-  - apiGroups: [""]
-    resources: ["configmaps"]
+    resources: ["pods", "nodes", "configmaps"]
     verbs: ["get", "list", "watch"]
   - apiGroups: [""]
     resources: ["nodes"]
-    verbs: ["create"]
+    verbs: ["create", "patch", "delete"]
   - apiGroups: [""]
-    resources: ["pods/binding", "pods/eviction"]
+    resources: ["pods/eviction"]
     verbs: ["create"]
   - apiGroups: ["apps"]
     resources: ["daemonsets"]
19 changes: 13 additions & 6 deletions pkg/controllers/node/finalizer.go
@@ -18,10 +18,12 @@ import (
 	"context"
 
 	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"knative.dev/pkg/ptr"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
-	"github.com/aws/karpenter/pkg/utils/functional"
 )
 
 // Finalizer is a subreconciler that ensures nodes have the termination
@@ -31,12 +33,17 @@ import (
 type Finalizer struct{}
 
 // Reconcile reconciles the node
-func (r *Finalizer) Reconcile(_ context.Context, _ *v1alpha5.Provisioner, n *v1.Node) (reconcile.Result, error) {
-	if !n.DeletionTimestamp.IsZero() {
+func (r *Finalizer) Reconcile(_ context.Context, provisioner *v1alpha5.Provisioner, node *v1.Node) (reconcile.Result, error) {
+	if !node.DeletionTimestamp.IsZero() {
 		return reconcile.Result{}, nil
 	}
-	if !functional.ContainsString(n.Finalizers, v1alpha5.TerminationFinalizer) {
-		n.Finalizers = append(n.Finalizers, v1alpha5.TerminationFinalizer)
-	}
+	node.OwnerReferences = []metav1.OwnerReference{{
+		APIVersion:         v1alpha5.SchemeGroupVersion.String(),
+		Kind:               "Provisioner",
+		Name:               provisioner.Name,
+		UID:                provisioner.UID,
+		BlockOwnerDeletion: ptr.Bool(true),
+	}}
+	controllerutil.AddFinalizer(node, v1alpha5.TerminationFinalizer)
 	return reconcile.Result{}, nil
 }
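
As an illustration of the ownership link this finalizer change establishes, here is a minimal sketch of how a client could check for it. The helper `hasProvisionerOwner` is hypothetical and not part of this commit; only the `metav1.OwnerReference` fields and `v1alpha5.SchemeGroupVersion` come from the diff above.

```go
package example

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
)

// hasProvisionerOwner reports whether the node carries an owner reference to the
// given provisioner, i.e. the reference stamped by the finalizer subreconciler above.
func hasProvisionerOwner(node *v1.Node, provisioner metav1.Object) bool {
	for _, ref := range node.OwnerReferences {
		if ref.APIVersion == v1alpha5.SchemeGroupVersion.String() &&
			ref.Kind == "Provisioner" &&
			ref.UID == provisioner.GetUID() {
			return true
		}
	}
	return false
}
```

Note that `BlockOwnerDeletion: true` additionally asks the API server, during a foreground cascading delete, not to remove the provisioner from storage until its owned nodes have been finalized.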
15 changes: 15 additions & 0 deletions pkg/controllers/node/suite_test.go
@@ -289,5 +289,20 @@ var _ = Describe("Controller", func() {
 			n = ExpectNodeExists(ctx, env.Client, n.Name)
 			Expect(n.Finalizers).To(Equal(n.Finalizers))
 		})
+		It("should add an owner reference to the node", func() {
+			n := test.Node(test.NodeOptions{ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{v1alpha5.ProvisionerNameLabelKey: provisioner.Name},
+			}})
+			ExpectApplied(ctx, env.Client, provisioner, n)
+			ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(n))
+			n = ExpectNodeExists(ctx, env.Client, n.Name)
+			Expect(n.OwnerReferences).To(Equal([]metav1.OwnerReference{{
+				APIVersion:         v1alpha5.SchemeGroupVersion.String(),
+				Kind:               "Provisioner",
+				Name:               provisioner.Name,
+				UID:                provisioner.UID,
+				BlockOwnerDeletion: ptr.Bool(true),
+			}}))
+		})
 	})
 })
4 changes: 4 additions & 0 deletions pkg/controllers/provisioning/provisioner.go
@@ -169,6 +169,7 @@ func (p *Provisioner) getPods(ctx context.Context) ([]*v1.Pod, error) {
 	return pods, nil
 }
 
+// nolint: gocyclo
 func (p *Provisioner) schedule(ctx context.Context, pods []*v1.Pod) ([]*scheduler.Node, error) {
 	defer metrics.Measure(schedulingDuration.WithLabelValues(injection.GetNamespacedName(ctx).Name))()
 
@@ -182,6 +183,9 @@ func (p *Provisioner) schedule(ctx context.Context, pods []*v1.Pod) ([]*schedule
 	}
 	for i := range provisionerList.Items {
 		provisioner := &provisionerList.Items[i]
+		if !provisioner.DeletionTimestamp.IsZero() {
+			continue
+		}
 		// Create node template
 		nodeTemplates = append(nodeTemplates, scheduling.NewNodeTemplate(provisioner))
 		// Get instance type options
11 changes: 11 additions & 0 deletions pkg/controllers/provisioning/suite_test.go
@@ -17,6 +17,7 @@ package provisioning_test
 import (
 	"context"
 	"testing"
+	"time"
 
 	"github.com/aws/karpenter/pkg/cloudprovider"
 
@@ -90,6 +91,16 @@ var _ = Describe("Provisioning", func() {
 			ExpectScheduled(ctx, env.Client, pod)
 		}
 	})
+	It("should ignore provisioners that are deleting", func() {
+		ExpectApplied(ctx, env.Client, test.Provisioner(test.ProvisionerOptions{ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &metav1.Time{Time: time.Now()}}}))
+		pods := ExpectProvisioned(ctx, env.Client, controller, test.UnschedulablePod())
+		nodes := &v1.NodeList{}
+		Expect(env.Client.List(ctx, nodes)).To(Succeed())
+		Expect(len(nodes.Items)).To(Equal(0))
+		for _, pod := range pods {
+			ExpectNotScheduled(ctx, env.Client, pod)
+		}
+	})
 	It("should provision nodes for pods with supported node selectors", func() {
 		provisioner := test.Provisioner()
 		schedulable := []*v1.Pod{
1 change: 1 addition & 0 deletions website/content/en/preview/tasks/deprovisioning.md
@@ -19,6 +19,7 @@ These include:
 
 There are both automated and manual ways of deprovisioning nodes provisioned by Karpenter:
 
+* **Provisioner Deletion**: Nodes are considered to be "owned" by the Provisioner that launched them. Karpenter will gracefully terminate nodes when a provisioner is deleted. Nodes may be reparented to another provisioner by modifying their labels. For example: `kubectl label node -l karpenter.sh/provisioner-name=source-provisioner-name karpenter.sh/provisioner-name=destination-provisioner-name --overwrite`.
 * **Node empty**: Karpenter notes when the last workload (non-daemonset) pod stops running on a node. From that point, Karpenter waits the number of seconds set by `ttlSecondsAfterEmpty` in the provisioner, then Karpenter requests to delete the node. This feature can keep costs down by removing nodes that are no longer being used for workloads.
 * **Node expired**: Karpenter requests to delete the node after a set number of seconds, based on the provisioner `ttlSecondsUntilExpired` value, from the time the node was provisioned. One use case for node expiry is to handle node upgrades. Old nodes (with a potentially outdated Kubernetes version or operating system) are deleted, and replaced with nodes on the current version (assuming that you requested the latest version, rather than a specific version).
 
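
To make the two TTL-based behaviors in the documentation hunk above concrete, here is a minimal sketch of a Provisioner configured with both timeouts. It is illustrative only and not part of this commit; the `TTLSecondsAfterEmpty` and `TTLSecondsUntilExpired` field names are assumed from the `v1alpha5` API, and the values are arbitrary.

```go
package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"knative.dev/pkg/ptr"

	"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
)

// exampleProvisioner deprovisions empty nodes after 30 seconds and expires every
// node 30 days after it was provisioned, matching the ttlSecondsAfterEmpty and
// ttlSecondsUntilExpired behaviors described above.
var exampleProvisioner = &v1alpha5.Provisioner{
	ObjectMeta: metav1.ObjectMeta{Name: "default"},
	Spec: v1alpha5.ProvisionerSpec{
		TTLSecondsAfterEmpty:   ptr.Int64(30),
		TTLSecondsUntilExpired: ptr.Int64(2592000), // 30 * 24 * 60 * 60
	},
}
```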
7 changes: 5 additions & 2 deletions website/content/en/preview/upgrade-guide/_index.md
@@ -46,7 +46,7 @@ When there is a breaking change we will:
 
 Besides the peer review process for all changes to the code base we also do the followings in order to find
 incompatibilities:
-* (To be implemented) To check the compatibility of the application, we will automate tests for installing, uninstalling, upgrading from an older version, and downgrading to an older version
+* (To be implemented) To check the compatibility of the application, we will automate tests for installing, uninstalling, upgrading from an older version, and downgrading to an older version
 * (To be implemented) To check the compatibility of the documentation with the application, we will turn the commands in our documentation into scripts that we can automatically run
 
 ## Nightly Builds
@@ -69,9 +69,12 @@ for a subset of older versions and deprecate the others.
 
 # Released Upgrade Notes
 
+## Upgrading to v0.12.0+
+v0.12.0 adds an OwnerReference to each Node created by a provisioner. Previously, deleting a provisioner would orphan nodes. Now, deleting a provisioner will cause Kubernetes [cascading delete](https://kubernetes.io/docs/concepts/architecture/garbage-collection/#cascading-deletion) logic to gracefully terminate the nodes using the Karpenter node finalizer. You may still orphan nodes by removing the owner reference.
+
 ## Upgrading to v0.11.0+
 
-v0.11.0 changes the way that the `vpc.amazonaws.com/pod-eni` resource is reported. Instead of being reported for all nodes that could support the resources regardless of if the cluster is configured to support it, it is now controlled by a command line flag or environment variable. The parameter defaults to false and must be set if your cluster uses [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html). This can be enabled by setting the environment variable `AWS_ENABLE_POD_ENI` to true via the helm value `controller.env`.
+v0.11.0 changes the way that the `vpc.amazonaws.com/pod-eni` resource is reported. Instead of being reported for all nodes that could support the resources regardless of if the cluster is configured to support it, it is now controlled by a command line flag or environment variable. The parameter defaults to false and must be set if your cluster uses [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html). This can be enabled by setting the environment variable `AWS_ENABLE_POD_ENI` to true via the helm value `controller.env`.
 
 Other extended resources must be registered on nodes by their respective device plugins which are typically installed as DaemonSets (e.g. the `nvidia.com/gpu` resource will be registered by the [NVIDIA device plugin](https://github.com/NVIDIA/k8s-device-plugin). Previously, Karpenter would register these resources on nodes at creation and they would be zeroed out by `kubelet` at startup. By allowing the device plugins to register the resources, pods will not bind to the nodes before any device plugin initialization has occurred.
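
For the opt-out mentioned in the v0.12.0 note above (orphaning nodes by removing the owner reference), here is a minimal sketch of one way to do it with a controller-runtime client before deleting the provisioner. `orphanNodes` is a hypothetical helper, not part of Karpenter or this commit; only `v1alpha5.ProvisionerNameLabelKey` comes from the diffs above.

```go
package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
)

// orphanNodes clears the owner references on every node launched by the named
// provisioner, so that deleting the provisioner afterwards does not cascade to
// its nodes. kubeClient is any controller-runtime client with node patch access.
func orphanNodes(ctx context.Context, kubeClient client.Client, provisionerName string) error {
	nodes := &v1.NodeList{}
	if err := kubeClient.List(ctx, nodes, client.MatchingLabels{v1alpha5.ProvisionerNameLabelKey: provisionerName}); err != nil {
		return err
	}
	for i := range nodes.Items {
		node := &nodes.Items[i]
		stored := node.DeepCopy()
		node.OwnerReferences = nil // drop the Provisioner owner reference
		if err := kubeClient.Patch(ctx, node, client.MergeFrom(stored)); err != nil {
			return err
		}
	}
	return nil
}
```

Once the owner references are cleared, deleting the provisioner leaves those nodes running, matching the pre-v0.12.0 behavior.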
