Skip to content

Commit

Permalink
Select GPU variant of eks optimized ami for nvidia & neuron (#684)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jacob Gabrielson committed Sep 21, 2021
1 parent 0ac64eb commit 903fffc
Show file tree
Hide file tree
Showing 8 changed files with 321 additions and 26 deletions.
260 changes: 260 additions & 0 deletions go.sum

Large diffs are not rendered by default.

27 changes: 20 additions & 7 deletions pkg/cloudprovider/aws/ami.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/aws/aws-sdk-go/service/ssm"
"github.com/aws/aws-sdk-go/service/ssm/ssmiface"
"github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha4"
"github.com/awslabs/karpenter/pkg/cloudprovider"
v1alpha1 "github.com/awslabs/karpenter/pkg/cloudprovider/aws/apis/v1alpha1"
"github.com/patrickmn/go-cache"
"k8s.io/client-go/kubernetes"
Expand All @@ -45,20 +46,32 @@ func NewAMIProvider(ssm ssmiface.SSMAPI, clientSet *kubernetes.Clientset) *AMIPr
}
}

func (p *AMIProvider) Get(ctx context.Context, constraints *v1alpha1.Constraints) (string, error) {
func (p *AMIProvider) getSSMParameter(ctx context.Context, constraints *v1alpha1.Constraints, instanceTypes []cloudprovider.InstanceType) (string, error) {
version, err := p.kubeServerVersion(ctx)
if err != nil {
return "", fmt.Errorf("kube server version, %w", err)
}
architecture := v1alpha4.ArchitectureAmd64 // default to amd
var amiNameSuffix string
if len(constraints.Architectures) > 0 {
architecture = constraints.Architectures[0] // select the first one if multiple supported
// select the first one if multiple supported
if constraints.Architectures[0] == v1alpha4.ArchitectureArm64 {
amiNameSuffix = "-arm64"
}
}
var architectureSuffix string
if architecture == v1alpha4.ArchitectureArm64 {
architectureSuffix = "-arm64"
if needsGPUAmi(instanceTypes) {
if amiNameSuffix != "" {
return "", fmt.Errorf("no amazon-linux-2 ami available for both nvidia/neuron gpus and arm64 cpus")
}
amiNameSuffix = "-gpu"
}
return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/amazon-linux-2%s/recommended/image_id", version, amiNameSuffix), nil
}

func (p *AMIProvider) Get(ctx context.Context, constraints *v1alpha1.Constraints, instanceTypes []cloudprovider.InstanceType) (string, error) {
name, err := p.getSSMParameter(ctx, constraints, instanceTypes)
if err != nil {
return "", err
}
name := fmt.Sprintf("/aws/service/eks/optimized-ami/%s/amazon-linux-2%s/recommended/image_id", version, architectureSuffix)
if id, ok := p.cache.Get(name); ok {
return id.(string), nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func (a *AWS) validateCapacityType(ctx context.Context) (errs *apis.FieldError)
return errs
}

func (a *AWS) validateInstanceProfile(ctx context.Context)(errs *apis.FieldError) {
func (a *AWS) validateInstanceProfile(ctx context.Context) (errs *apis.FieldError) {
if a.InstanceProfile == "" {
errs = errs.Also(apis.ErrMissingField("instanceProfile"))
}
Expand Down
3 changes: 1 addition & 2 deletions pkg/cloudprovider/aws/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ func withUserAgent(sess *session.Session) *session.Session {
}

// Create a node given the constraints.

func (c *CloudProvider) Create(ctx context.Context, constraints *v1alpha4.Constraints, instanceTypes []cloudprovider.InstanceType, callback func(*v1.Node) error) chan error {
return c.creationQueue.Add(func() error {
return c.create(ctx, constraints, instanceTypes, callback)
Expand All @@ -139,7 +138,7 @@ func (c *CloudProvider) create(ctx context.Context, v1alpha4constraints *v1alpha
return fmt.Errorf("getting subnets, %w", err)
}
// 2. Get Launch Template
launchTemplate, err := c.launchTemplateProvider.Get(ctx, constraints)
launchTemplate, err := c.launchTemplateProvider.Get(ctx, constraints, instanceTypes)
if err != nil {
return fmt.Errorf("getting launch template, %w", err)
}
Expand Down
45 changes: 34 additions & 11 deletions pkg/cloudprovider/aws/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
"github.com/awslabs/karpenter/pkg/cloudprovider"
v1alpha1 "github.com/awslabs/karpenter/pkg/cloudprovider/aws/apis/v1alpha1"
"github.com/awslabs/karpenter/pkg/utils/restconfig"
"github.com/mitchellh/hashstructure/v2"
Expand Down Expand Up @@ -76,19 +77,14 @@ type launchTemplateOptions struct {
AMIID string
}

type LaunchTemplate struct {
Name string
Version string
}

func (p *LaunchTemplateProvider) Get(ctx context.Context, constraints *v1alpha1.Constraints) (string, error) {
func (p *LaunchTemplateProvider) Get(ctx context.Context, constraints *v1alpha1.Constraints, instanceTypes []cloudprovider.InstanceType) (string, error) {
// 1. If the customer specified a launch template then just use it
if constraints.LaunchTemplate != nil {
return ptr.StringValue(constraints.LaunchTemplate), nil
}

// 2. Get constrained AMI ID
amiID, err := p.amiProvider.Get(ctx, constraints)
amiID, err := p.amiProvider.Get(ctx, constraints, instanceTypes)
if err != nil {
return "", err
}
Expand All @@ -100,7 +96,7 @@ func (p *LaunchTemplateProvider) Get(ctx context.Context, constraints *v1alpha1.
}

// 3. Get userData for Node
userData, err := p.getUserData(ctx, constraints)
userData, err := p.getUserData(ctx, constraints, instanceTypes)
if err != nil {
return "", err
}
Expand Down Expand Up @@ -149,6 +145,28 @@ func (p *LaunchTemplateProvider) ensureLaunchTemplate(ctx context.Context, optio
return launchTemplate, nil
}

func needsGPUAmi(is []cloudprovider.InstanceType) bool {
for _, i := range is {
if !i.NvidiaGPUs().IsZero() || !i.AWSNeurons().IsZero() {
return true
}
}
return false
}

// needsDocker returns true if the instance type is unable to use
// conatinerd directly
func needsDocker(is []cloudprovider.InstanceType) bool {
for _, i := range is {
// This function can be removed once containerd support for
// Neurons is in the EKS Optimized AMI
if !i.AWSNeurons().IsZero() {
return true
}
}
return false
}

func (p *LaunchTemplateProvider) createLaunchTemplate(ctx context.Context, options *launchTemplateOptions) (*ec2.LaunchTemplate, error) {
output, err := p.ec2api.CreateLaunchTemplateWithContext(ctx, &ec2.CreateLaunchTemplateInput{
LaunchTemplateName: aws.String(launchTemplateName(options)),
Expand Down Expand Up @@ -185,13 +203,18 @@ func (p *LaunchTemplateProvider) createLaunchTemplate(ctx context.Context, optio
return output.LaunchTemplate, nil
}

func (p *LaunchTemplateProvider) getUserData(ctx context.Context, constraints *v1alpha1.Constraints) (string, error) {
func (p *LaunchTemplateProvider) getUserData(ctx context.Context, constraints *v1alpha1.Constraints, instanceTypes []cloudprovider.InstanceType) (string, error) {
var containerRuntimeArg string
if !needsDocker(instanceTypes) {
containerRuntimeArg = "--container-runtime containerd"
}

var userData bytes.Buffer
userData.WriteString(fmt.Sprintf(`#!/bin/bash
/etc/eks/bootstrap.sh '%s' \
--container-runtime containerd \
/etc/eks/bootstrap.sh '%s' %s \
--apiserver-endpoint '%s'`,
constraints.Cluster.Name,
containerRuntimeArg,
constraints.Cluster.Endpoint))
caBundle, err := p.GetCABundle(ctx)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/cloudprovider/aws/subnets.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func (s *SubnetProvider) getFilters(ctx context.Context, constraints *v1alpha1.C
// Filter by zone
if constraints.Zones != nil {
filters = append(filters, &ec2.Filter{
Name: aws.String("availability-zone"),
Name: aws.String("availability-zone"),
Values: aws.StringSlice(constraints.Zones),
})
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/cloudprovider/aws/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,19 +317,19 @@ var _ = Describe("Allocation", func() {
})
})
Context("Defaulting", func() {
It("should default subnetSelector" ,func() {
It("should default subnetSelector", func() {
provisioner.SetDefaults(ctx)
constraints, err := v1alpha1.NewConstraints(&provisioner.Spec.Constraints)
Expect(err).ToNot(HaveOccurred())
Expect(constraints.SubnetSelector).To(Equal(map[string]string{"kubernetes.io/cluster/test-cluster": ""}))
})
It("should default securityGroupSelector" ,func() {
It("should default securityGroupSelector", func() {
provisioner.SetDefaults(ctx)
constraints, err := v1alpha1.NewConstraints(&provisioner.Spec.Constraints)
Expect(err).ToNot(HaveOccurred())
Expect(constraints.SecurityGroupsSelector).To(Equal(map[string]string{"kubernetes.io/cluster/test-cluster": ""}))
})
It("should default capacityType" ,func() {
It("should default capacityType", func() {
provisioner.SetDefaults(ctx)
constraints, err := v1alpha1.NewConstraints(&provisioner.Spec.Constraints)
Expect(err).ToNot(HaveOccurred())
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/allocation/scheduling/constraints.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
v1 "k8s.io/api/core/v1"
)

func NewConstraintsWithOverrides(constraints *v1alpha4.Constraints, pod *v1.Pod) (*v1alpha4.Constraints) {
func NewConstraintsWithOverrides(constraints *v1alpha4.Constraints, pod *v1.Pod) *v1alpha4.Constraints {
return &v1alpha4.Constraints{
Provider: constraints.Provider,
Labels: functional.UnionStringMaps(constraints.Labels, pod.Spec.NodeSelector),
Expand Down

0 comments on commit 903fffc

Please sign in to comment.