chore: Support v1beta1 in CloudProvider interface (#4585)
jonathan-innis committed Sep 8, 2023
1 parent 40a7ad1 commit f39c1d0
Showing 37 changed files with 183 additions and 173 deletions.
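The heart of the change is below in pkg/cloudprovider/cloudprovider.go: every CloudProvider method that took or returned v1alpha5 types now speaks v1beta1, with `*v1alpha5.Machine` becoming `*corev1beta1.NodeClaim`, `*v1alpha5.Provisioner` becoming `*corev1beta1.NodePool`, and `IsMachineDrifted` renamed to `IsDrifted`. As a reading aid, here is a sketch of the resulting method set reconstructed from those signatures — not the authoritative interface definition, which lives in karpenter-core and may differ in detail:

```go
// Sketch only: method set reconstructed from the signatures in this diff.
package sketch

import (
	"context"
	"net/http"

	corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
	"github.com/aws/karpenter-core/pkg/cloudprovider"
)

type CloudProvider interface {
	// Create launches capacity for a NodeClaim (formerly *v1alpha5.Machine).
	Create(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) (*corev1beta1.NodeClaim, error)
	// Delete terminates the instance backing a NodeClaim.
	Delete(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) error
	// Get and List resolve cloud instances back into NodeClaims.
	Get(ctx context.Context, providerID string) (*corev1beta1.NodeClaim, error)
	List(ctx context.Context) ([]*corev1beta1.NodeClaim, error)
	// GetInstanceTypes is now keyed by NodePool (formerly *v1alpha5.Provisioner).
	GetInstanceTypes(ctx context.Context, nodePool *corev1beta1.NodePool) ([]*cloudprovider.InstanceType, error)
	// IsDrifted replaces IsMachineDrifted.
	IsDrifted(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) (cloudprovider.DriftReason, error)
	// Link and LivenessProbe keep their shapes, modulo the NodeClaim parameter.
	Link(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) error
	LivenessProbe(req *http.Request) error
}
```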
2 changes: 1 addition & 1 deletion .github/actions/e2e/cleanup/action.yaml
@@ -35,7 +35,7 @@ runs:
run: |
for name in $(aws iam list-instance-profiles --query "InstanceProfiles[*].{Name:InstanceProfileName}" --output text); do
tags=$(aws iam list-instance-profile-tags --instance-profile-name $name --output json || true)
if [[ $(echo $tags | jq -r '.Tags[] | select(.Key == "testing.karpenter.sh/cluster") | .Value') == "${{ inputs.cluster_name }}" ]]; then
if [[ $(echo $tags | jq -r '.Tags[] | select(.Key == "testing/cluster") | .Value') == "${{ inputs.cluster_name }}" ]]; then
roleName=$(aws iam get-instance-profile --instance-profile-name $name --query "InstanceProfile.Roles[*].{Name:RoleName}" --output text)
aws iam remove-role-from-instance-profile --instance-profile-name $name --role-name $roleName
aws iam delete-instance-profile --instance-profile-name $name
6 changes: 3 additions & 3 deletions .github/actions/e2e/create-cluster/action.yaml
@@ -57,7 +57,7 @@ runs:
--template-file $CLOUDFORMATION_PATH \
--capabilities CAPABILITY_NAMED_IAM \
--parameter-overrides "ClusterName=${{ inputs.cluster_name }}" \
--tags "testing.karpenter.sh/type=e2e" "github.com/run-url=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" "karpenter.sh/discovery=${{ inputs.cluster_name }}"
--tags "testing/type=e2e" "github.com/run-url=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" "karpenter.sh/discovery=${{ inputs.cluster_name }}"
- name: create or upgrade cluster
shell: bash
run: |
@@ -76,7 +76,7 @@ runs:
tags:
karpenter.sh/discovery: ${{ inputs.cluster_name }}
github.com/run-url: "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
testing.karpenter.sh/type: "e2e"
testing/type: "e2e"
kubernetesNetworkConfig:
ipFamily: ${{ inputs.ip_family }}
managedNodeGroups:
@@ -132,7 +132,7 @@ runs:
oidc_id=$(aws eks describe-cluster --name ${{ inputs.cluster_name }} --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 3,4,5)
arn="arn:aws:iam::${{ inputs.account_id }}:oidc-provider/${oidc_id}"
aws iam tag-open-id-connect-provider --open-id-connect-provider-arn $arn \
--tags Key=testing.karpenter.sh/type,Value=e2e Key=github.com/run-url,Value=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
--tags Key=testing/type,Value=e2e Key=github.com/run-url,Value=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
- name: give KarpenterNodeRole permission to bootstrap
shell: bash
run: |
2 changes: 1 addition & 1 deletion go.mod
@@ -7,7 +7,7 @@ require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/avast/retry-go v3.0.0+incompatible
github.com/aws/aws-sdk-go v1.44.328
github.com/aws/karpenter-core v0.30.0
github.com/aws/karpenter-core v0.30.1-0.20230908181031-cbc03e98ef14
github.com/imdario/mergo v0.3.16
github.com/mitchellh/hashstructure/v2 v2.0.2
github.com/onsi/ginkgo/v2 v2.11.0
4 changes: 2 additions & 2 deletions go.sum
@@ -53,8 +53,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/aws/aws-sdk-go v1.44.328 h1:WBwlf8ym9SDQ/GTIBO9eXyvwappKJyOetWJKl4mT7ZU=
github.com/aws/aws-sdk-go v1.44.328/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/karpenter-core v0.30.0 h1:iEn5D/mvaqPeYNix7JPZyuyELf5Qu2Fx2WP8lFByhDo=
github.com/aws/karpenter-core v0.30.0/go.mod h1:AQl8m8OtgO2N8IlZlzAU6MTrJTJSbe6K4GwdRUNSJVc=
github.com/aws/karpenter-core v0.30.1-0.20230908181031-cbc03e98ef14 h1:sBNA922mx/1UIq2Y1zI21R3/bGs1q9Mj/9xILbFI6zo=
github.com/aws/karpenter-core v0.30.1-0.20230908181031-cbc03e98ef14/go.mod h1:AQl8m8OtgO2N8IlZlzAU6MTrJTJSbe6K4GwdRUNSJVc=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A=
github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
3 changes: 2 additions & 1 deletion hack/docs/instancetypes_gen_docs.go
@@ -39,6 +39,7 @@ import (
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
coreoperator "github.com/aws/karpenter-core/pkg/operator"
coretest "github.com/aws/karpenter-core/pkg/test"
nodepoolutil "github.com/aws/karpenter-core/pkg/utils/nodepool"
"github.com/aws/karpenter/pkg/apis/settings"
awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider"
"github.com/aws/karpenter/pkg/operator"
@@ -111,7 +112,7 @@ func main() {
},
},
}
instanceTypes, err := cp.GetInstanceTypes(ctx, prov)
instanceTypes, err := cp.GetInstanceTypes(ctx, nodepoolutil.New(prov))
if err != nil {
log.Fatalf("listing instance types, %s", err)
}
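The generator change above shows the migration pattern for callers that still hold v1alpha5 objects: rather than converting by hand, they wrap the object with a karpenter-core utility before crossing the v1beta1 boundary. A minimal sketch of that call pattern, assuming (as the diff implies) that `nodepoolutil.New` returns the `*corev1beta1.NodePool` view of a Provisioner without mutating it:

```go
package sketch

import (
	"context"

	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
	"github.com/aws/karpenter-core/pkg/cloudprovider"
	nodepoolutil "github.com/aws/karpenter-core/pkg/utils/nodepool"
	awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider"
)

// listForProvisioner adapts a legacy v1alpha5 Provisioner to the new
// NodePool-based GetInstanceTypes, mirroring the call in
// hack/docs/instancetypes_gen_docs.go above.
func listForProvisioner(ctx context.Context, cp *awscloudprovider.CloudProvider, prov *v1alpha5.Provisioner) ([]*cloudprovider.InstanceType, error) {
	// The wrapper only re-presents the Provisioner in v1beta1 shape.
	return cp.GetInstanceTypes(ctx, nodepoolutil.New(prov))
}
```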
40 changes: 20 additions & 20 deletions pkg/apis/crds/karpenter.sh_nodepools.yaml
@@ -51,37 +51,37 @@ spec:
Node properties are determined from a combination of provisioner and
pod scheduling constraints.
properties:
deprovisioning:
disruption:
default:
consolidateAfter: 15s
consolidationPolicy: WhenUnderutilized
consolidationTTL: 15s
expirationTTL: 90d
description: Deprovisioning contains the parameters that relate to
Karpenter's deprovisioning logic
expireAfter: 720h
description: Disruption contains the parameters that relate to Karpenter's
disruption logic
properties:
consolidateAfter:
default: 15s
description: ConsolidateAfter is the duration the controller will
wait before attempting to terminate nodes that are underutilized.
Refer to ConsolidationPolicy for how underutilization is considered.
pattern: ^(([0-9]+(s|m|h))+)|(Never)$
type: string
consolidationPolicy:
default: WhenUnderutilized
description: ConsolidationPolicy describes which nodes Karpenter
can deprovision through its consolidation algorithm. This policy
can disrupt through its consolidation algorithm. This policy
defaults to "WhenUnderutilized" if not specified
enum:
- Never
- WhenEmpty
- WhenUnderutilized
type: string
consolidationTTL:
default: 15s
description: ConsolidationTTL is the duration the controller will
wait before attempting to terminate nodes that are underutilized.
Refer to ConsolidationPolicy for how underutilization is considered.
type: string
expirationTTL:
default: 90d
description: ExpirationTTL is the duration the controller will
wait before terminating a node, measured from when the node
is created. This is useful to implement features like eventually
consistent node upgrade, memory leak protection, and disruption
testing.
expireAfter:
default: 720h
description: ExpireAfter is the duration the controller will wait
before terminating a node, measured from when the node is created.
This is useful to implement features like eventually consistent
node upgrade, memory leak protection, and disruption testing.
pattern: ^(([0-9]+(s|m|h))+)|(Never)$
type: string
type: object
limits:
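Worth noting in the schema above: the renamed fields gain the validation pattern `^(([0-9]+(s|m|h))+)|(Never)$`, which admits concatenated second/minute/hour durations and the literal `Never`, but no day unit — presumably why the old `90d` default is re-expressed as `720h`. A standalone check of the pattern (sample values are illustrative only):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Pattern copied verbatim from the consolidateAfter/expireAfter fields above.
	durationOrNever := regexp.MustCompile(`^(([0-9]+(s|m|h))+)|(Never)$`)
	for _, v := range []string{"15s", "720h", "2h30m", "Never", "90d"} {
		// "90d" does not match: the pattern has no day suffix.
		fmt.Printf("%-6s valid=%t\n", v, durationOrNever.MatchString(v))
	}
}
```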
99 changes: 52 additions & 47 deletions pkg/cloudprovider/cloudprovider.go
@@ -91,8 +91,7 @@ func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance
}

// Create a machine given the constraints.
func (c *CloudProvider) Create(ctx context.Context, machine *v1alpha5.Machine) (*v1alpha5.Machine, error) {
nodeClaim := nodeclaimutil.New(machine)
func (c *CloudProvider) Create(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) (*corev1beta1.NodeClaim, error) {
nodeClass, err := c.resolveNodeClassFromNodeClaim(ctx, nodeClaim)
if err != nil {
if errors.IsNotFound(err) {
@@ -114,39 +113,39 @@ func (c *CloudProvider) Create(ctx context.Context, machine *v1alpha5.Machine) (
instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
return i.Name == instance.Type
})
m := c.instanceToMachine(instance, instanceType)
m.Annotations = lo.Assign(m.Annotations, nodeclassutil.HashAnnotation(nodeClass))
return m, nil
nc := c.instanceToNodeClaim(instance, instanceType)
nc.Annotations = lo.Assign(nc.Annotations, nodeclassutil.HashAnnotation(nodeClass))
return nc, nil
}

// Link adds a tag to the cloudprovider machine to tell the cloudprovider that it's now owned by a Machine
func (c *CloudProvider) Link(ctx context.Context, machine *v1alpha5.Machine) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("machine", machine.Name))
id, err := utils.ParseInstanceID(machine.Status.ProviderID)
func (c *CloudProvider) Link(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With(lo.Ternary(nodeClaim.IsMachine, "machine", "nodeclaim"), nodeClaim.Name))
id, err := utils.ParseInstanceID(nodeClaim.Status.ProviderID)
if err != nil {
return fmt.Errorf("getting instance ID, %w", err)
}
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("id", id))
return c.instanceProvider.Link(ctx, id, machine.Labels[v1alpha5.ProvisionerNameLabelKey])
return c.instanceProvider.Link(ctx, id, nodeClaim.Labels[v1alpha5.ProvisionerNameLabelKey])
}

func (c *CloudProvider) List(ctx context.Context) ([]*v1alpha5.Machine, error) {
func (c *CloudProvider) List(ctx context.Context) ([]*corev1beta1.NodeClaim, error) {
instances, err := c.instanceProvider.List(ctx)
if err != nil {
return nil, fmt.Errorf("listing instances, %w", err)
}
var machines []*v1alpha5.Machine
var nodeClaims []*corev1beta1.NodeClaim
for _, instance := range instances {
instanceType, err := c.resolveInstanceTypeFromInstance(ctx, instance)
if err != nil {
return nil, fmt.Errorf("resolving instance type, %w", err)
}
machines = append(machines, c.instanceToMachine(instance, instanceType))
nodeClaims = append(nodeClaims, c.instanceToNodeClaim(instance, instanceType))
}
return machines, nil
return nodeClaims, nil
}

func (c *CloudProvider) Get(ctx context.Context, providerID string) (*v1alpha5.Machine, error) {
func (c *CloudProvider) Get(ctx context.Context, providerID string) (*corev1beta1.NodeClaim, error) {
id, err := utils.ParseInstanceID(providerID)
if err != nil {
return nil, fmt.Errorf("getting instance ID, %w", err)
@@ -160,19 +159,18 @@ func (c *CloudProvider) Get(ctx context.Context, providerID string) (*v1alpha5.M
if err != nil {
return nil, fmt.Errorf("resolving instance type, %w", err)
}
return c.instanceToMachine(instance, instanceType), nil
return c.instanceToNodeClaim(instance, instanceType), nil
}

func (c *CloudProvider) LivenessProbe(req *http.Request) error {
return c.instanceTypeProvider.LivenessProbe(req)
}

// GetInstanceTypes returns all available InstanceTypes
func (c *CloudProvider) GetInstanceTypes(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]*cloudprovider.InstanceType, error) {
if provisioner == nil {
func (c *CloudProvider) GetInstanceTypes(ctx context.Context, nodePool *corev1beta1.NodePool) ([]*cloudprovider.InstanceType, error) {
if nodePool == nil {
return c.instanceTypeProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, &v1beta1.NodeClass{})
}
nodePool := nodepoolutil.New(provisioner)
nodeClass, err := c.resolveNodeClassFromNodePool(ctx, nodePool)
if err != nil {
if errors.IsNotFound(err) {
@@ -188,10 +186,10 @@ func (c *CloudProvider) GetInstanceTypes(ctx context.Context, provisioner *v1alp
return instanceTypes, nil
}

func (c *CloudProvider) Delete(ctx context.Context, machine *v1alpha5.Machine) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("machine", machine.Name))
func (c *CloudProvider) Delete(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) error {
ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With(lo.Ternary(nodeClaim.IsMachine, "machine", "nodeclaim"), nodeClaim.Name))

providerID := lo.Ternary(machine.Status.ProviderID != "", machine.Status.ProviderID, machine.Annotations[v1alpha5.MachineLinkedAnnotationKey])
providerID := lo.Ternary(nodeClaim.Status.ProviderID != "", nodeClaim.Status.ProviderID, nodeClaim.Annotations[v1alpha5.MachineLinkedAnnotationKey])
id, err := utils.ParseInstanceID(providerID)
if err != nil {
return fmt.Errorf("getting instance ID, %w", err)
@@ -200,8 +198,7 @@ func (c *CloudProvider) Delete(ctx context.Context, machine *v1alpha5.Machine) e
return c.instanceProvider.Delete(ctx, id)
}

func (c *CloudProvider) IsMachineDrifted(ctx context.Context, machine *v1alpha5.Machine) (cloudprovider.DriftReason, error) {
nodeClaim := nodeclaimutil.New(machine)
func (c *CloudProvider) IsDrifted(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) (cloudprovider.DriftReason, error) {
// Not needed when GetInstanceTypes removes nodepool dependency
nodePool, err := nodeclaimutil.Owner(ctx, c.kubeClient, nodeClaim)
if err != nil {
@@ -291,14 +288,14 @@ func (c *CloudProvider) resolveInstanceTypes(ctx context.Context, nodeClaim *cor
}
reqs := scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...)
return lo.Filter(instanceTypes, func(i *cloudprovider.InstanceType, _ int) bool {
return reqs.Compatible(i.Requirements) == nil &&
return reqs.Compatible(i.Requirements, lo.Ternary(nodeClaim.IsMachine, scheduling.AllowUndefinedWellKnownLabelsV1Alpha5, scheduling.AllowUndefinedWellKnownLabelsV1Beta1)) == nil &&
len(i.Offerings.Requirements(reqs).Available()) > 0 &&
resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable())
}), nil
}

func (c *CloudProvider) resolveInstanceTypeFromInstance(ctx context.Context, instance *instance.Instance) (*cloudprovider.InstanceType, error) {
provisioner, err := c.resolveProvisionerFromInstance(ctx, instance)
provisioner, err := c.resolveNodePoolFromInstance(ctx, instance)
if err != nil {
// If we can't resolve the provisioner, we fall back to not getting instance type info
return nil, client.IgnoreNotFound(fmt.Errorf("resolving provisioner, %w", err))
@@ -314,20 +311,30 @@ func (c *CloudProvider) resolveInstanceTypeFromInstance(ctx context.Context, ins
return instanceType, nil
}

func (c *CloudProvider) resolveProvisionerFromInstance(ctx context.Context, instance *instance.Instance) (*v1alpha5.Provisioner, error) {
provisioner := &v1alpha5.Provisioner{}
provisionerName, ok := instance.Tags[v1alpha5.ProvisionerNameLabelKey]
if !ok {
return nil, errors.NewNotFound(schema.GroupResource{Group: v1alpha5.Group, Resource: "Provisioner"}, "")
}
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: provisionerName}, provisioner); err != nil {
return nil, err
func (c *CloudProvider) resolveNodePoolFromInstance(ctx context.Context, instance *instance.Instance) (*corev1beta1.NodePool, error) {
provisionerName := instance.Tags[v1alpha5.ProvisionerNameLabelKey]
nodePoolName := instance.Tags[corev1beta1.NodePoolLabelKey]

switch {
case nodePoolName != "":
nodePool := &corev1beta1.NodePool{}
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: nodePoolName}, nodePool); err != nil {
return nil, err
}
return nodePool, nil
case provisionerName != "":
provisioner := &v1alpha5.Provisioner{}
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: provisionerName}, provisioner); err != nil {
return nil, err
}
return nodepoolutil.New(provisioner), nil
default:
return nil, errors.NewNotFound(schema.GroupResource{Group: corev1beta1.Group, Resource: "NodePool"}, "")
}
return provisioner, nil
}

func (c *CloudProvider) instanceToMachine(i *instance.Instance, instanceType *cloudprovider.InstanceType) *v1alpha5.Machine {
machine := &v1alpha5.Machine{}
func (c *CloudProvider) instanceToNodeClaim(i *instance.Instance, instanceType *cloudprovider.InstanceType) *corev1beta1.NodeClaim {
nodeClaim := &corev1beta1.NodeClaim{}
labels := map[string]string{}
annotations := map[string]string{}

@@ -337,27 +344,25 @@ func (c *CloudProvider) instanceToMachine(i *instance.Instance, instanceType *cl
labels[key] = req.Values()[0]
}
}
machine.Status.Capacity = functional.FilterMap(instanceType.Capacity, func(_ v1.ResourceName, v resource.Quantity) bool { return !resources.IsZero(v) })
machine.Status.Allocatable = functional.FilterMap(instanceType.Allocatable(), func(_ v1.ResourceName, v resource.Quantity) bool { return !resources.IsZero(v) })
nodeClaim.Status.Capacity = functional.FilterMap(instanceType.Capacity, func(_ v1.ResourceName, v resource.Quantity) bool { return !resources.IsZero(v) })
nodeClaim.Status.Allocatable = functional.FilterMap(instanceType.Allocatable(), func(_ v1.ResourceName, v resource.Quantity) bool { return !resources.IsZero(v) })
}
labels[v1.LabelTopologyZone] = i.Zone
labels[corev1beta1.CapacityTypeLabelKey] = i.CapacityType
if v, ok := i.Tags[v1alpha5.ProvisionerNameLabelKey]; ok {
labels[v1alpha5.ProvisionerNameLabelKey] = v
}
if v, ok := i.Tags[corev1beta1.NodePoolLabelKey]; ok {
labels[corev1beta1.NodePoolLabelKey] = v
nodeClaim.IsMachine = true
}
if v, ok := i.Tags[corev1beta1.ManagedByAnnotationKey]; ok {
annotations[corev1beta1.ManagedByAnnotationKey] = v
}
machine.Labels = labels
machine.Annotations = annotations
machine.CreationTimestamp = metav1.Time{Time: i.LaunchTime}
nodeClaim.Labels = labels
nodeClaim.Annotations = annotations
nodeClaim.CreationTimestamp = metav1.Time{Time: i.LaunchTime}
// Set the deletionTimestamp to be the current time if the instance is currently terminating
if i.State == ec2.InstanceStateNameShuttingDown || i.State == ec2.InstanceStateNameTerminated {
machine.DeletionTimestamp = &metav1.Time{Time: time.Now()}
nodeClaim.DeletionTimestamp = &metav1.Time{Time: time.Now()}
}
machine.Status.ProviderID = fmt.Sprintf("aws:///%s/%s", i.Zone, i.ID)
return machine
nodeClaim.Status.ProviderID = fmt.Sprintf("aws:///%s/%s", i.Zone, i.ID)
return nodeClaim
}
5 changes: 2 additions & 3 deletions pkg/cloudprovider/drift.go
@@ -23,7 +23,6 @@ import (

corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/cloudprovider"
provisionerutil "github.com/aws/karpenter-core/pkg/utils/provisioner"
"github.com/aws/karpenter-core/pkg/utils/sets"
"github.com/aws/karpenter/pkg/apis/v1alpha1"
"github.com/aws/karpenter/pkg/apis/v1beta1"
@@ -64,7 +63,7 @@ func (c *CloudProvider) isNodeClassDrifted(ctx context.Context, nodeClaim *corev

func (c *CloudProvider) isAMIDrifted(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodePool *corev1beta1.NodePool,
instance *instance.Instance, nodeClass *v1beta1.NodeClass) (cloudprovider.DriftReason, error) {
instanceTypes, err := c.GetInstanceTypes(ctx, provisionerutil.New(nodePool))
instanceTypes, err := c.GetInstanceTypes(ctx, nodePool)
if err != nil {
return "", fmt.Errorf("getting instanceTypes, %w", err)
}
@@ -84,7 +83,7 @@ func (c *CloudProvider) isAMIDrifted(ctx context.Context, nodeClaim *corev1beta1
if len(amis) == 0 {
return "", fmt.Errorf("no amis exist given constraints")
}
mappedAMIs := amis.MapToInstanceTypes([]*cloudprovider.InstanceType{nodeInstanceType})
mappedAMIs := amis.MapToInstanceTypes([]*cloudprovider.InstanceType{nodeInstanceType}, nodeClaim.IsMachine)
if len(mappedAMIs) == 0 {
return "", fmt.Errorf("no instance types satisfy requirements of amis %v", amis)
}
