Skip to content

Commit

Permalink
chore: Enable v1beta1/NodeClass conversion for CloudProvider (#4503)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis committed Sep 1, 2023
1 parent 44750f2 commit 3696509
Show file tree
Hide file tree
Showing 30 changed files with 1,170 additions and 1,019 deletions.
104 changes: 72 additions & 32 deletions pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,16 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime/schema"

corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/events"
"github.com/aws/karpenter-core/pkg/utils/functional"
machineutil "github.com/aws/karpenter-core/pkg/utils/machine"
nodepoolutil "github.com/aws/karpenter-core/pkg/utils/nodepool"
"github.com/aws/karpenter/pkg/apis"
"github.com/aws/karpenter/pkg/apis/v1alpha1"
"github.com/aws/karpenter/pkg/apis/v1beta1"
"github.com/aws/karpenter/pkg/utils"
nodeclassutil "github.com/aws/karpenter/pkg/utils/nodeclass"

"github.com/aws/karpenter-core/pkg/scheduling"
"github.com/aws/karpenter-core/pkg/utils/resources"
Expand All @@ -41,6 +46,7 @@ import (
"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/client"

nodeclaimutil "github.com/aws/karpenter-core/pkg/utils/nodeclaim"
cloudproviderevents "github.com/aws/karpenter/pkg/cloudprovider/events"
"github.com/aws/karpenter/pkg/providers/amifamily"
"github.com/aws/karpenter/pkg/providers/instance"
Expand Down Expand Up @@ -85,33 +91,30 @@ func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance

// Create a machine given the constraints.
func (c *CloudProvider) Create(ctx context.Context, machine *v1alpha5.Machine) (*v1alpha5.Machine, error) {
nodeTemplate, err := c.resolveNodeTemplate(ctx, []byte(machine.
Annotations[v1alpha5.ProviderCompatabilityAnnotationKey]), machine.
Spec.MachineTemplateRef)
nodeClaim := nodeclaimutil.New(machine)
nodeClass, err := c.resolveNodeClassFromNodeClaim(ctx, nodeClaim)
if err != nil {
if errors.IsNotFound(err) {
c.recorder.Publish(cloudproviderevents.MachineFailedToResolveNodeTemplate(machine))
c.recorder.Publish(cloudproviderevents.NodeClaimFailedToResolveNodeClass(nodeClaim))
}
return nil, fmt.Errorf("resolving node template, %w", err)
return nil, fmt.Errorf("resolving node class, %w", err)
}
instanceTypes, err := c.resolveInstanceTypes(ctx, machine, nodeTemplate)
instanceTypes, err := c.resolveInstanceTypes(ctx, nodeClaim, nodeClass)
if err != nil {
return nil, fmt.Errorf("resolving instance types, %w", err)
}
if len(instanceTypes) == 0 {
return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch"))
}
instance, err := c.instanceProvider.Create(ctx, nodeTemplate, machine, instanceTypes)
instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, instanceTypes)
if err != nil {
return nil, fmt.Errorf("creating instance, %w", err)
}
instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
return i.Name == instance.Type
})
m := c.instanceToMachine(instance, instanceType)
m.Annotations = lo.Assign(m.Annotations, map[string]string{
v1alpha1.AnnotationNodeTemplateHash: nodeTemplate.Hash(),
})
m.Annotations = lo.Assign(m.Annotations, nodeclassutil.HashAnnotation(nodeClass))
return m, nil
}

Expand Down Expand Up @@ -166,21 +169,18 @@ func (c *CloudProvider) LivenessProbe(req *http.Request) error {
// GetInstanceTypes returns all available InstanceTypes
func (c *CloudProvider) GetInstanceTypes(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]*cloudprovider.InstanceType, error) {
if provisioner == nil {
return c.instanceTypeProvider.List(ctx, &v1alpha5.KubeletConfiguration{}, &v1alpha1.AWSNodeTemplate{})
}
var rawProvider []byte
if provisioner.Spec.Provider != nil {
rawProvider = provisioner.Spec.Provider.Raw
return c.instanceTypeProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, &v1beta1.NodeClass{})
}
nodeTemplate, err := c.resolveNodeTemplate(ctx, rawProvider, provisioner.Spec.ProviderRef)
nodePool := nodepoolutil.New(provisioner)
nodeClass, err := c.resolveNodeClassFromNodePool(ctx, nodePool)
if err != nil {
if errors.IsNotFound(err) {
c.recorder.Publish(cloudproviderevents.ProvisionerFailedToResolveNodeTemplate(provisioner))
c.recorder.Publish(cloudproviderevents.NodePoolFailedToResolveNodeClass(nodePool))
}
return nil, client.IgnoreNotFound(err)
return nil, client.IgnoreNotFound(fmt.Errorf("resolving node class, %w", err))
}
// TODO, break this coupling
instanceTypes, err := c.instanceTypeProvider.List(ctx, provisioner.Spec.KubeletConfiguration, nodeTemplate)
instanceTypes, err := c.instanceTypeProvider.List(ctx, nodePool.Spec.Template.Spec.KubeletConfiguration, nodeClass)
if err != nil {
return nil, err
}
Expand All @@ -200,22 +200,23 @@ func (c *CloudProvider) Delete(ctx context.Context, machine *v1alpha5.Machine) e
}

func (c *CloudProvider) IsMachineDrifted(ctx context.Context, machine *v1alpha5.Machine) (cloudprovider.DriftReason, error) {
// Not needed when GetInstanceTypes removes provisioner dependency
provisioner := &v1alpha5.Provisioner{}
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: machine.Labels[v1alpha5.ProvisionerNameLabelKey]}, provisioner); err != nil {
return "", client.IgnoreNotFound(fmt.Errorf("getting provisioner, %w", err))
nodeClaim := nodeclaimutil.New(machine)
// Not needed when GetInstanceTypes removes nodepool dependency
nodePool, err := nodeclaimutil.Owner(ctx, c.kubeClient, nodeClaim)
if err != nil {
return "", client.IgnoreNotFound(fmt.Errorf("resolving owner, %w", err))
}
if provisioner.Spec.ProviderRef == nil {
if nodePool.Spec.Template.Spec.NodeClass == nil {
return "", nil
}
nodeTemplate, err := c.resolveNodeTemplate(ctx, nil, provisioner.Spec.ProviderRef)
nodeClass, err := c.resolveNodeClassFromNodePool(ctx, nodePool)
if err != nil {
if errors.IsNotFound(err) {
c.recorder.Publish(cloudproviderevents.ProvisionerFailedToResolveNodeTemplate(provisioner))
c.recorder.Publish(cloudproviderevents.NodePoolFailedToResolveNodeClass(nodePool))
}
return "", client.IgnoreNotFound(fmt.Errorf("resolving node template, %w", err))
return "", client.IgnoreNotFound(fmt.Errorf("resolving node class, %w", err))
}
driftReason, err := c.isNodeTemplateDrifted(ctx, machine, provisioner, nodeTemplate)
driftReason, err := c.isNodeClassDrifted(ctx, nodeClaim, nodePool, nodeClass)
if err != nil {
return "", err
}
Expand All @@ -227,6 +228,45 @@ func (c *CloudProvider) Name() string {
return "aws"
}

func (c *CloudProvider) resolveNodeClassFromNodeClaim(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) (*v1beta1.NodeClass, error) {
// TODO @joinnis: Remove this handling for Machine resolution when we remove v1alpha5
if nodeClaim.IsMachine {
nodeTemplate, err := c.resolveNodeTemplate(ctx,
[]byte(nodeClaim.Annotations[v1alpha5.ProviderCompatabilityAnnotationKey]),
machineutil.NewMachineTemplateRef(nodeClaim.Spec.NodeClass))
if err != nil {
return nil, fmt.Errorf("resolving node template, %w", err)
}
return nodeclassutil.New(nodeTemplate), nil
}
nodeClass := &v1beta1.NodeClass{}
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: nodeClaim.Spec.NodeClass.Name}, nodeClass); err != nil {
return nil, err
}
return nodeClass, nil
}

func (c *CloudProvider) resolveNodeClassFromNodePool(ctx context.Context, nodePool *corev1beta1.NodePool) (*v1beta1.NodeClass, error) {
// TODO @joinnis: Remove this handling for Provisioner resolution when we remove v1alpha5
if nodePool.IsProvisioner {
var rawProvider []byte
if nodePool.Spec.Template.Spec.Provider != nil {
rawProvider = nodePool.Spec.Template.Spec.Provider.Raw
}
nodeTemplate, err := c.resolveNodeTemplate(ctx, rawProvider, machineutil.NewMachineTemplateRef(nodePool.Spec.Template.Spec.NodeClass))
if err != nil {
return nil, fmt.Errorf("resolving node template, %w", err)
}
return nodeclassutil.New(nodeTemplate), nil
}
nodeClass := &v1beta1.NodeClass{}
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: nodePool.Spec.Template.Spec.NodeClass.Name}, nodeClass); err != nil {
return nil, err
}
return nodeClass, nil
}

// TODO @joinnis: Remove this handling for NodeTemplate resolution when we remove v1alpha5
func (c *CloudProvider) resolveNodeTemplate(ctx context.Context, raw []byte, objRef *v1alpha5.MachineTemplateRef) (*v1alpha1.AWSNodeTemplate, error) {
nodeTemplate := &v1alpha1.AWSNodeTemplate{}
if objRef != nil {
Expand All @@ -243,16 +283,16 @@ func (c *CloudProvider) resolveNodeTemplate(ctx context.Context, raw []byte, obj
return nodeTemplate, nil
}

func (c *CloudProvider) resolveInstanceTypes(ctx context.Context, machine *v1alpha5.Machine, nodeTemplate *v1alpha1.AWSNodeTemplate) ([]*cloudprovider.InstanceType, error) {
instanceTypes, err := c.instanceTypeProvider.List(ctx, machine.Spec.Kubelet, nodeTemplate)
func (c *CloudProvider) resolveInstanceTypes(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodeClass *v1beta1.NodeClass) ([]*cloudprovider.InstanceType, error) {
instanceTypes, err := c.instanceTypeProvider.List(ctx, nodeClaim.Spec.KubeletConfiguration, nodeClass)
if err != nil {
return nil, fmt.Errorf("getting instance types, %w", err)
}
reqs := scheduling.NewNodeSelectorRequirements(machine.Spec.Requirements...)
reqs := scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...)
return lo.Filter(instanceTypes, func(i *cloudprovider.InstanceType, _ int) bool {
return reqs.Compatible(i.Requirements) == nil &&
len(i.Offerings.Requirements(reqs).Available()) > 0 &&
resources.Fits(machine.Spec.Resources.Requests, i.Allocatable())
resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable())
}), nil
}

Expand Down
60 changes: 34 additions & 26 deletions pkg/cloudprovider/drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ import (
"github.com/samber/lo"
v1 "k8s.io/api/core/v1"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/cloudprovider"
provisionerutil "github.com/aws/karpenter-core/pkg/utils/provisioner"
"github.com/aws/karpenter-core/pkg/utils/sets"
"github.com/aws/karpenter/pkg/apis/v1alpha1"
"github.com/aws/karpenter/pkg/apis/v1beta1"
"github.com/aws/karpenter/pkg/providers/amifamily"
"github.com/aws/karpenter/pkg/providers/instance"
"github.com/aws/karpenter/pkg/utils"
Expand All @@ -37,45 +39,45 @@ const (
NodeTemplateDrift cloudprovider.DriftReason = "NodeTemplateDrift"
)

func (c *CloudProvider) isNodeTemplateDrifted(ctx context.Context, machine *v1alpha5.Machine, provisioner *v1alpha5.Provisioner, nodeTemplate *v1alpha1.AWSNodeTemplate) (cloudprovider.DriftReason, error) {
instance, err := c.getInstance(ctx, machine.Status.ProviderID)
func (c *CloudProvider) isNodeClassDrifted(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodePool *corev1beta1.NodePool, nodeClass *v1beta1.NodeClass) (cloudprovider.DriftReason, error) {
instance, err := c.getInstance(ctx, nodeClaim.Status.ProviderID)
if err != nil {
return "", err
}
amiDrifted, err := c.isAMIDrifted(ctx, machine, provisioner, instance, nodeTemplate)
amiDrifted, err := c.isAMIDrifted(ctx, nodeClaim, nodePool, instance, nodeClass)
if err != nil {
return "", fmt.Errorf("calculating ami drift, %w", err)
}
securitygroupDrifted, err := c.areSecurityGroupsDrifted(instance, nodeTemplate)
securitygroupDrifted, err := c.areSecurityGroupsDrifted(instance, nodeClass)
if err != nil {
return "", fmt.Errorf("calculating securitygroup drift, %w", err)
}
subnetDrifted, err := c.isSubnetDrifted(instance, nodeTemplate)
subnetDrifted, err := c.isSubnetDrifted(instance, nodeClass)
if err != nil {
return "", fmt.Errorf("calculating subnet drift, %w", err)
}
drifted := lo.FindOrElse([]cloudprovider.DriftReason{amiDrifted, securitygroupDrifted, subnetDrifted, c.areStaticFieldsDrifted(machine, nodeTemplate)}, "", func(i cloudprovider.DriftReason) bool {
drifted := lo.FindOrElse([]cloudprovider.DriftReason{amiDrifted, securitygroupDrifted, subnetDrifted, c.areStaticFieldsDrifted(nodeClaim, nodeClass)}, "", func(i cloudprovider.DriftReason) bool {
return string(i) != ""
})
return drifted, nil
}

func (c *CloudProvider) isAMIDrifted(ctx context.Context, machine *v1alpha5.Machine, provisioner *v1alpha5.Provisioner,
instance *instance.Instance, nodeTemplate *v1alpha1.AWSNodeTemplate) (cloudprovider.DriftReason, error) {
instanceTypes, err := c.GetInstanceTypes(ctx, provisioner)
func (c *CloudProvider) isAMIDrifted(ctx context.Context, nodeClaim *corev1beta1.NodeClaim, nodePool *corev1beta1.NodePool,
instance *instance.Instance, nodeClass *v1beta1.NodeClass) (cloudprovider.DriftReason, error) {
instanceTypes, err := c.GetInstanceTypes(ctx, provisionerutil.New(nodePool))
if err != nil {
return "", fmt.Errorf("getting instanceTypes, %w", err)
}
nodeInstanceType, found := lo.Find(instanceTypes, func(instType *cloudprovider.InstanceType) bool {
return instType.Name == machine.Labels[v1.LabelInstanceTypeStable]
return instType.Name == nodeClaim.Labels[v1.LabelInstanceTypeStable]
})
if !found {
return "", fmt.Errorf(`finding node instance type "%s"`, machine.Labels[v1.LabelInstanceTypeStable])
return "", fmt.Errorf(`finding node instance type "%s"`, nodeClaim.Labels[v1.LabelInstanceTypeStable])
}
if nodeTemplate.Spec.LaunchTemplateName != nil {
if nodeClass.Spec.LaunchTemplateName != nil {
return "", nil
}
amis, err := c.amiProvider.Get(ctx, nodeTemplate, &amifamily.Options{})
amis, err := c.amiProvider.Get(ctx, nodeClass, &amifamily.Options{})
if err != nil {
return "", fmt.Errorf("getting amis, %w", err)
}
Expand All @@ -92,12 +94,12 @@ func (c *CloudProvider) isAMIDrifted(ctx context.Context, machine *v1alpha5.Mach
return "", nil
}

func (c *CloudProvider) isSubnetDrifted(instance *instance.Instance, nodeTemplate *v1alpha1.AWSNodeTemplate) (cloudprovider.DriftReason, error) {
func (c *CloudProvider) isSubnetDrifted(instance *instance.Instance, nodeClass *v1beta1.NodeClass) (cloudprovider.DriftReason, error) {
// If the node template status does not have subnets, wait for the subnets to be populated before continuing
if nodeTemplate.Status.Subnets == nil {
if len(nodeClass.Status.Subnets) == 0 {
return "", fmt.Errorf("AWSNodeTemplate has no subnets")
}
_, found := lo.Find(nodeTemplate.Status.Subnets, func(subnet v1alpha1.Subnet) bool {
_, found := lo.Find(nodeClass.Status.Subnets, func(subnet v1beta1.Subnet) bool {
return subnet.ID == instance.SubnetID
})
if !found {
Expand All @@ -108,13 +110,13 @@ func (c *CloudProvider) isSubnetDrifted(instance *instance.Instance, nodeTemplat

// Checks if the security groups are drifted, by comparing the AWSNodeTemplate.Status.SecurityGroups
// to the ec2 instance security groups
func (c *CloudProvider) areSecurityGroupsDrifted(ec2Instance *instance.Instance, nodeTemplate *v1alpha1.AWSNodeTemplate) (cloudprovider.DriftReason, error) {
// nodeTemplate.Spec.SecurityGroupSelector can be nil if the user is using a launchTemplateName to define SecurityGroups
func (c *CloudProvider) areSecurityGroupsDrifted(ec2Instance *instance.Instance, nodeClass *v1beta1.NodeClass) (cloudprovider.DriftReason, error) {
// nodeClass.Spec.SecurityGroupSelector can be nil if the user is using a launchTemplateName to define SecurityGroups
// Karpenter will not drift on changes to securitygroup in the launchTemplateName
if nodeTemplate.Spec.LaunchTemplateName != nil {
if nodeClass.Spec.LaunchTemplateName != nil {
return "", nil
}
securityGroupIds := sets.New(lo.Map(nodeTemplate.Status.SecurityGroups, func(sg v1alpha1.SecurityGroup, _ int) string { return sg.ID })...)
securityGroupIds := sets.New(lo.Map(nodeClass.Status.SecurityGroups, func(sg v1beta1.SecurityGroup, _ int) string { return sg.ID })...)
if len(securityGroupIds) == 0 {
return "", fmt.Errorf("no security groups exist in the AWSNodeTemplate Status")
}
Expand All @@ -125,13 +127,19 @@ func (c *CloudProvider) areSecurityGroupsDrifted(ec2Instance *instance.Instance,
return "", nil
}

func (c *CloudProvider) areStaticFieldsDrifted(machine *v1alpha5.Machine, nodeTemplate *v1alpha1.AWSNodeTemplate) cloudprovider.DriftReason {
nodeTemplateHash, foundHashNodeTemplate := nodeTemplate.ObjectMeta.Annotations[v1alpha1.AnnotationNodeTemplateHash]
machineHash, foundHashMachine := machine.ObjectMeta.Annotations[v1alpha1.AnnotationNodeTemplateHash]
if !foundHashNodeTemplate || !foundHashMachine {
func (c *CloudProvider) areStaticFieldsDrifted(nodeClaim *corev1beta1.NodeClaim, nodeClass *v1beta1.NodeClass) cloudprovider.DriftReason {
var ownerHashKey string
if nodeClaim.IsMachine {
ownerHashKey = v1alpha1.AnnotationNodeTemplateHash
} else {
ownerHashKey = v1beta1.AnnotationNodeClassHash
}
nodeClassHash, foundHashNodeClass := nodeClass.Annotations[ownerHashKey]
nodeClaimHash, foundHashNodeClaim := nodeClaim.Annotations[ownerHashKey]
if !foundHashNodeClass || !foundHashNodeClaim {
return ""
}
if nodeTemplateHash != machineHash {
if nodeClassHash != nodeClaimHash {
return NodeTemplateDrift
}
return ""
Expand Down
38 changes: 29 additions & 9 deletions pkg/cloudprovider/events/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,44 @@ package events
import (
v1 "k8s.io/api/core/v1"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/events"
machineutil "github.com/aws/karpenter-core/pkg/utils/machine"
provisionerutil "github.com/aws/karpenter-core/pkg/utils/provisioner"
)

func ProvisionerFailedToResolveNodeTemplate(provisioner *v1alpha5.Provisioner) events.Event {
func NodePoolFailedToResolveNodeClass(nodePool *v1beta1.NodePool) events.Event {
if nodePool.IsProvisioner {
provisioner := provisionerutil.New(nodePool)
return events.Event{
InvolvedObject: provisioner,
Type: v1.EventTypeWarning,
Message: "Failed resolving AWSNodeTemplate",
DedupeValues: []string{string(provisioner.UID)},
}
}
return events.Event{
InvolvedObject: provisioner,
InvolvedObject: nodePool,
Type: v1.EventTypeWarning,
Message: "Failed resolving AWSNodeTemplate",
DedupeValues: []string{string(provisioner.UID)},
Message: "Failed resolving NodeClass",
DedupeValues: []string{string(nodePool.UID)},
}
}

func MachineFailedToResolveNodeTemplate(machine *v1alpha5.Machine) events.Event {
func NodeClaimFailedToResolveNodeClass(nodeClaim *v1beta1.NodeClaim) events.Event {
if nodeClaim.IsMachine {
machine := machineutil.NewFromNodeClaim(nodeClaim)
return events.Event{
InvolvedObject: machine,
Type: v1.EventTypeWarning,
Message: "Failed resolving AWSNodeTemplate",
DedupeValues: []string{string(machine.UID)},
}
}
return events.Event{
InvolvedObject: machine,
InvolvedObject: nodeClaim,
Type: v1.EventTypeWarning,
Message: "Failed resolving AWSNodeTemplate",
DedupeValues: []string{string(machine.UID)},
Message: "Failed resolving NodeClass",
DedupeValues: []string{string(nodeClaim.UID)},
}
}

0 comments on commit 3696509

Please sign in to comment.