Skip to content

Commit

Permalink
Retry DescribeInstance in the rare cases that PrivateDNSName eventual…
Browse files Browse the repository at this point in the history
… consistency is too slow (#564)
  • Loading branch information
ellistarn committed Jul 28, 2021
1 parent cdf88f2 commit 9583ef3
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 76 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.16

require (
github.com/Pallinder/go-randomdata v1.2.0
github.com/avast/retry-go v2.7.0+incompatible
github.com/aws/aws-sdk-go v1.38.69
github.com/deckarep/golang-set v1.7.1
github.com/go-logr/zapr v0.4.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmV
github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
github.com/aryann/difflib v0.0.0-20170710044230-e206f873d14a/go.mod h1:DAHtR1m6lCRdSC2Tm3DSWRPvIPr6xNKyeHdqDQSQT+A=
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/avast/retry-go v2.7.0+incompatible h1:XaGnzl7gESAideSjr+I8Hki/JBi+Yb9baHlMRPeSC84=
github.com/avast/retry-go v2.7.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/aws/aws-lambda-go v1.13.3/go.mod h1:4UKl9IzQMoD+QF79YdCuzCwp8VbmG4VAQwij/eHl5CU=
github.com/aws/aws-sdk-go v1.23.20/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
github.com/aws/aws-sdk-go v1.27.0/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
Expand Down
6 changes: 1 addition & 5 deletions pkg/cloudprovider/aws/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import (
"github.com/aws/aws-sdk-go/service/ssm"
"github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha3"
"github.com/awslabs/karpenter/pkg/cloudprovider"
"github.com/awslabs/karpenter/pkg/cloudprovider/aws/utils"
"github.com/awslabs/karpenter/pkg/utils/parallel"
"github.com/awslabs/karpenter/pkg/utils/project"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -76,10 +75,7 @@ type CloudProvider struct {
}

func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider {
sess := withUserAgent(session.Must(
session.NewSession(request.WithRetryer(
&aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint},
utils.NewRetryer()))))
sess := withUserAgent(session.Must(session.NewSession(&aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint})))
if *sess.Config.Region == "" {
logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service")
*sess.Config.Region = getRegionFromIMDS(sess)
Expand Down
61 changes: 40 additions & 21 deletions pkg/cloudprovider/aws/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ package aws
import (
"context"
"fmt"
"time"

"github.com/avast/retry-go"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
Expand All @@ -35,32 +38,21 @@ type NodeFactory struct {

// For a given set of instanceIDs return a map of instanceID to Kubernetes node object.
func (n *NodeFactory) For(ctx context.Context, instanceId *string) (*v1.Node, error) {
describeInstancesOutput, err := n.ec2api.DescribeInstancesWithContext(ctx, &ec2.DescribeInstancesInput{InstanceIds: []*string{instanceId}})
if aerr, ok := err.(awserr.Error); ok {
return nil, aerr
}
if err != nil {
return nil, fmt.Errorf("failed to describe ec2 instances, %w", err)
}
if len(describeInstancesOutput.Reservations) != 1 {
return nil, fmt.Errorf("expected a single instance reservation, got %d", len(describeInstancesOutput.Reservations))
instance := ec2.Instance{}
// EC2 is eventually consistent, so backoff-retry until we have the data we need.
if err := retry.Do(
func() (err error) { return n.getInstance(ctx, instanceId, &instance) },
retry.Delay(1 * time.Second),
retry.Attempts(3),
); err != nil {
return nil, err
}
if len(describeInstancesOutput.Reservations[0].Instances) != 1 {
return nil, fmt.Errorf("expected a single instance, got %d", len(describeInstancesOutput.Reservations[0].Instances))
}
instance := *describeInstancesOutput.Reservations[0].Instances[0]
logging.FromContext(ctx).Infof("Launched instance: %s, type: %s, zone: %s, hostname: %s",
*instance.InstanceId,
*instance.InstanceType,
*instance.Placement.AvailabilityZone,
*instance.PrivateDnsName,
)
return &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: *instance.PrivateDnsName,
Name: aws.StringValue(instance.PrivateDnsName),
},
Spec: v1.NodeSpec{
ProviderID: fmt.Sprintf("aws:///%s/%s", *instance.Placement.AvailabilityZone, *instance.InstanceId),
ProviderID: fmt.Sprintf("aws:///%s/%s", aws.StringValue(instance.Placement.AvailabilityZone), aws.StringValue(instance.InstanceId)),
},
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{
Expand All @@ -76,3 +68,30 @@ func (n *NodeFactory) For(ctx context.Context, instanceId *string) (*v1.Node, er
},
}, nil
}

func (n *NodeFactory) getInstance(ctx context.Context, instanceId *string, instance *ec2.Instance) error {
describeInstancesOutput, err := n.ec2api.DescribeInstancesWithContext(ctx, &ec2.DescribeInstancesInput{InstanceIds: []*string{instanceId}})
if aerr, ok := err.(awserr.Error); ok && aerr.Code() == "InvalidInstanceID.NotFound" {
return aerr
}
if err != nil {
return fmt.Errorf("failed to describe ec2 instances, %w", err)
}
if len(describeInstancesOutput.Reservations) != 1 {
return fmt.Errorf("expected a single instance reservation, got %d", len(describeInstancesOutput.Reservations))
}
if len(describeInstancesOutput.Reservations[0].Instances) != 1 {
return fmt.Errorf("expected a single instance, got %d", len(describeInstancesOutput.Reservations[0].Instances))
}
*instance = *describeInstancesOutput.Reservations[0].Instances[0]
if len(aws.StringValue(instance.PrivateDnsName)) == 0 {
return fmt.Errorf("expected PrivateDnsName to be set")
}
logging.FromContext(ctx).Infof("Launched instance: %s, type: %s, zone: %s, hostname: %s",
aws.StringValue(instance.InstanceId),
aws.StringValue(instance.InstanceType),
aws.StringValue(instance.Placement.AvailabilityZone),
aws.StringValue(instance.PrivateDnsName),
)
return nil
}
50 changes: 0 additions & 50 deletions pkg/cloudprovider/aws/utils/retryer.go

This file was deleted.

0 comments on commit 9583ef3

Please sign in to comment.