Skip to content

Commit

Permalink
config, handlers: Increase steady state and burst throttles for task …
Browse files Browse the repository at this point in the history
…endpoints

This fixes #1231
  • Loading branch information
sharanyad committed Feb 19, 2018
1 parent f85ad79 commit 91dc8fa
Show file tree
Hide file tree
Showing 12 changed files with 152 additions and 25 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## 1.17.1-dev
* Bug - Fixed a bug that was causing a runtime panic by accessing negative index in the health check log slice. [#1239](https://github.com/aws/amazon-ecs-agent/pull/1239)
* Bug - Fixed a bug where steady state throttle limits for task metadata endpoints
were too low for applications [#1240](https://github.com/aws/amazon-ecs-agent/pull/1240)

## 1.17.0
* Feature - Support a HTTP endpoint for `awsvpc` tasks to query metadata
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ additional details on each available environment variable.
| `ECS_HOST_DATA_DIR` | `/var/lib/ecs` | The source directory on the host from which ECS_DATADIR is mounted. We use this to determine the source mount path for container metadata files in the case the ECS Agent is running as a container. We do not use this value in Windows because the ECS Agent is not running as container in Windows. | `/var/lib/ecs` | `Not used` |
| `ECS_ENABLE_TASK_CPU_MEM_LIMIT` | `true` | Whether to enable task-level cpu and memory limits | `true` | `false` |
| `ECS_CGROUP_PATH` | `/sys/fs/cgroup` | The root cgroup path that is expected by the ECS agent. This is the path that accessible from the agent mount. | `/sys/fs/cgroup` | Not applicable |
| `ECS_TASK_METADATA_RPS_LIMIT` | `100,150` | Comma separated values for steady state and burst throttle limits for task metadata endpoint | `40,60` | `40,60` |


### Persistence

Expand Down
38 changes: 38 additions & 0 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,14 @@ const (

// pauseContainerTarball is the path to the pause container tarball
pauseContainerTarballPath = "/images/amazon-ecs-pause.tar"

// DefaultTaskMetadataSteadyStateRate is set as 40. This is arrived from our benchmarking
// results where task endpoint can handle 4000 rps effectively. Here, 100 containers
// will be able to send out 40 rps.
DefaultTaskMetadataSteadyStateRate = 40

// DefaultTaskMetadataBurstRate is set to handle 60 burst requests at once
DefaultTaskMetadataBurstRate = 60
)

var (
Expand Down Expand Up @@ -346,6 +354,7 @@ func environmentConfig() (Config, error) {
}
containerMetadataEnabled := utils.ParseBool(os.Getenv("ECS_ENABLE_CONTAINER_METADATA"), false)
dataDirOnHost := os.Getenv("ECS_HOST_DATA_DIR")
steadyStateRate, burstRate := getTaskMetadataThrottles()

if len(errs) > 0 {
err = utils.NewMultiError(errs...)
Expand Down Expand Up @@ -391,6 +400,8 @@ func environmentConfig() (Config, error) {
DataDirOnHost: dataDirOnHost,
OverrideAWSLogsExecutionRole: overrideAWSLogsExecutionRoleEnabled,
CgroupPath: cgroupPath,
TaskMetadataSteadyStateRate: steadyStateRate,
TaskMetadataBurstRate: burstRate,
}, err
}

Expand Down Expand Up @@ -434,6 +445,27 @@ func getTaskCPUMemLimitEnabled() Conditional {
return taskCPUMemLimitEnabled
}

func getTaskMetadataThrottles() (int, int) {
var steadyStateRate, burstRate int
rpsLimitEnvVal := os.Getenv("ECS_TASK_METADATA_RPS_LIMIT")
rpsLimitSplits := strings.Split(rpsLimitEnvVal, ",")
if len(rpsLimitSplits) != 2 {
seelog.Warn(`Invalid format for "ECS_TASK_METADATA_RPS_LIMIT", expected: "rateLimit,burst"`)
return steadyStateRate, burstRate
}
steadyStateRate, err := strconv.Atoi(strings.TrimSpace(rpsLimitSplits[0]))
if err != nil {
seelog.Warnf(`Invalid format for "ECS_TASK_METADATA_RPS_LIMIT", expected integer for steady state rate: %v`, err)
return 0, 0
}
burstRate, err = strconv.Atoi(strings.TrimSpace(rpsLimitSplits[1]))
if err != nil {
seelog.Warnf(`Invalid format for "ECS_TASK_METADATA_RPS_LIMIT", expected integer for burst rate: %v`, err)
return 0, 0
}
return steadyStateRate, burstRate
}

func parseEnvVariableUint16(envVar string) uint16 {
envVal := os.Getenv(envVar)
var var16 uint16
Expand Down Expand Up @@ -557,6 +589,12 @@ func (cfg *Config) validateAndOverrideBounds() error {
cfg.NumImagesToDeletePerCycle = DefaultNumImagesToDeletePerCycle
}

if cfg.TaskMetadataSteadyStateRate <= 0 || cfg.TaskMetadataBurstRate <= 0 {
seelog.Warnf("Invalid values for rate limits, will be overridden with default values: %d,%d.", DefaultTaskMetadataSteadyStateRate, DefaultTaskMetadataBurstRate)
cfg.TaskMetadataSteadyStateRate = DefaultTaskMetadataSteadyStateRate
cfg.TaskMetadataBurstRate = DefaultTaskMetadataBurstRate
}

cfg.platformOverrides()

return nil
Expand Down
71 changes: 71 additions & 0 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func TestEnvironmentConfig(t *testing.T) {
defer setTestEnv("ECS_NUM_IMAGES_DELETE_PER_CYCLE", "2")()
defer setTestEnv("ECS_INSTANCE_ATTRIBUTES", "{\"my_attribute\": \"testing\"}")()
defer setTestEnv("ECS_ENABLE_TASK_ENI", "true")()
defer setTestEnv("ECS_TASK_METADATA_RPS_LIMIT", "1000,1100")()
additionalLocalRoutesJSON := `["1.2.3.4/22","5.6.7.8/32"]`
setTestEnv("ECS_AWSVPC_ADDITIONAL_LOCAL_ROUTES", additionalLocalRoutesJSON)
setTestEnv("ECS_ENABLE_CONTAINER_METADATA", "true")
Expand Down Expand Up @@ -115,6 +116,8 @@ func TestEnvironmentConfig(t *testing.T) {
assert.Equal(t, additionalLocalRoutesJSON, string(serializedAdditionalLocalRoutesJSON))
assert.Equal(t, "/etc/ecs/", conf.DataDirOnHost, "Wrong value for DataDirOnHost")
assert.True(t, conf.ContainerMetadataEnabled, "Wrong value for ContainerMetadataEnabled")
assert.Equal(t, 1000, conf.TaskMetadataSteadyStateRate)
assert.Equal(t, 1100, conf.TaskMetadataBurstRate)
}

func TestTrimWhitespaceWhenCreating(t *testing.T) {
Expand Down Expand Up @@ -356,6 +359,74 @@ func TestAWSLogsExecutionRole(t *testing.T) {
assert.True(t, conf.OverrideAWSLogsExecutionRole)
}

func TestTaskMetadataRPSLimits(t *testing.T) {
testCases := []struct {
name string
envVarVal string
expectedSteadyStateRate int
expectedBurstRate int
}{
{
name: "negative limit values",
envVarVal: "-10,-10",
expectedSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
expectedBurstRate: DefaultTaskMetadataBurstRate,
},
{
name: "negative limit,valid burst",
envVarVal: "-10,10",
expectedSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
expectedBurstRate: DefaultTaskMetadataBurstRate,
},
{
name: "missing limit,valid burst",
envVarVal: " ,10",
expectedSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
expectedBurstRate: DefaultTaskMetadataBurstRate,
},
{
name: "valid limit,missing burst",
envVarVal: "10,",
expectedSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
expectedBurstRate: DefaultTaskMetadataBurstRate,
},
{
name: "empty variable",
envVarVal: "",
expectedSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
expectedBurstRate: DefaultTaskMetadataBurstRate,
},
{
name: "missing burst",
envVarVal: "10",
expectedSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
expectedBurstRate: DefaultTaskMetadataBurstRate,
},
{
name: "more than expected values",
envVarVal: "10,10,10",
expectedSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
expectedBurstRate: DefaultTaskMetadataBurstRate,
},
{
name: "values with spaces",
envVarVal: " 10 ,5 ",
expectedSteadyStateRate: 10,
expectedBurstRate: 5,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
defer setTestEnv("ECS_TASK_METADATA_RPS_LIMIT", tc.envVarVal)()
defer setTestRegion()()
cfg, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, tc.expectedSteadyStateRate, cfg.TaskMetadataSteadyStateRate)
assert.Equal(t, tc.expectedBurstRate, cfg.TaskMetadataBurstRate)
})
}
}

func setTestRegion() func() {
return setTestEnv("AWS_DEFAULT_REGION", "us-west-2")
}
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ func DefaultConfig() Config {
ContainerMetadataEnabled: false,
TaskCPUMemLimit: DefaultEnabled,
CgroupPath: defaultCgroupPath,
TaskMetadataSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
TaskMetadataBurstRate: DefaultTaskMetadataBurstRate,
}
}

Expand Down
4 changes: 4 additions & 0 deletions agent/config/config_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, defaultCNIPluginsPath, cfg.CNIPluginsPath, "CNIPluginsPath default is set incorrectly")
assert.False(t, cfg.AWSVPCBlockInstanceMetdata, "AWSVPCBlockInstanceMetdata default is incorrectly set")
assert.Equal(t, "/var/lib/ecs", cfg.DataDirOnHost, "Default DataDirOnHost set incorrectly")
assert.Equal(t, DefaultTaskMetadataSteadyStateRate, cfg.TaskMetadataSteadyStateRate,
"Default TaskMetadataSteadyStateRate is set incorrectly")
assert.Equal(t, DefaultTaskMetadataBurstRate, cfg.TaskMetadataBurstRate,
"Default TaskMetadataBurstRate is set incorrectly")
}

// TestConfigFromFile tests the configuration can be read from file
Expand Down
2 changes: 2 additions & 0 deletions agent/config/config_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ func DefaultConfig() Config {
NumImagesToDeletePerCycle: DefaultNumImagesToDeletePerCycle,
ContainerMetadataEnabled: false,
TaskCPUMemLimit: ExplicitlyDisabled,
TaskMetadataSteadyStateRate: DefaultTaskMetadataSteadyStateRate,
TaskMetadataBurstRate: DefaultTaskMetadataBurstRate,
}
}

Expand Down
4 changes: 4 additions & 0 deletions agent/config/config_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ func TestConfigDefault(t *testing.T) {
assert.Equal(t, DefaultImageCleanupTimeInterval, cfg.ImageCleanupInterval, "ImageCleanupInterval default is set incorrectly")
assert.Equal(t, DefaultNumImagesToDeletePerCycle, cfg.NumImagesToDeletePerCycle, "NumImagesToDeletePerCycle default is set incorrectly")
assert.Equal(t, `C:\ProgramData\Amazon\ECS\data`, cfg.DataDirOnHost, "Default DataDirOnHost set incorrectly")
assert.Equal(t, DefaultTaskMetadataSteadyStateRate, cfg.TaskMetadataSteadyStateRate,
"Default TaskMetadataSteadyStateRate is set incorrectly")
assert.Equal(t, DefaultTaskMetadataBurstRate, cfg.TaskMetadataBurstRate,
"Default TaskMetadataBurstRate is set incorrectly")
}

func TestConfigIAMTaskRolesReserves80(t *testing.T) {
Expand Down
6 changes: 6 additions & 0 deletions agent/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,4 +198,10 @@ type Config struct {
// CgroupPath is the path expected by the agent, defaults to
// '/sys/fs/cgroup'
CgroupPath string

// TaskMetadataSteadyStateRate specifies the steady state throttle for the task metadata endpoint
TaskMetadataSteadyStateRate int

// TaskMetadataBurstRate specifies the burst rate throttle for the task metadata endpoint
TaskMetadataBurstRate int
}
26 changes: 7 additions & 19 deletions agent/handlers/taskmetadata/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,6 @@ const (
// Credentials API versions
apiVersion1 = 1
apiVersion2 = 2

// Rate limits for the metadata endpoint(s) and APIs by request ip addresses. Today, we
// support the following endpoints:
// 1. /v2/credentials: For serving IAM role credentials
// 2. /v2/metadata: For serving task and container metadata information (for "awsvpc" tasks)
// 3. /v2/stats: For serving task and container stats information (for "awsvpc" tasks)

// rateLimitPerSecond specifies the steady state throttle for the task metadata endpoint.
// Because all containers in a task share the same IP address in an "awsvpc" task, and a
// task can be constituted of up to 10 containers, the steady state rate is set at 10
// per second
rateLimitPerSecond = 10

// rateLimitBurstPerSecond specifies the burst rate throttle for the task metadata endpoint.
rateLimitBurstPerSecond = 15
)

// ServeHTTP serves IAM Role Credentials for Tasks being managed by the agent.
Expand All @@ -74,7 +59,8 @@ func ServeHTTP(credentialsManager credentials.Manager,

auditLogger := audit.NewAuditLog(containerInstanceArn, cfg, logger)

server := setupServer(credentialsManager, auditLogger, state, cfg.Cluster, statsEngine)
server := setupServer(credentialsManager, auditLogger, state, cfg.Cluster, statsEngine,
cfg.TaskMetadataSteadyStateRate, cfg.TaskMetadataBurstRate)

for {
utils.RetryWithBackoff(utils.NewSimpleBackoff(time.Second, time.Minute, 0.2, 2), func() error {
Expand All @@ -93,7 +79,9 @@ func setupServer(credentialsManager credentials.Manager,
auditLogger audit.AuditLogger,
state dockerstate.TaskEngineState,
cluster string,
statsEngine stats.Engine) *http.Server {
statsEngine stats.Engine,
steadyStateRate int,
burstRate int) *http.Server {
serverMux := http.NewServeMux()
// Credentials handlers
serverMux.HandleFunc(credentials.V1CredentialsPath,
Expand All @@ -109,8 +97,8 @@ func setupServer(credentialsManager credentials.Manager,
serverMux.HandleFunc(statsPath+"/", statsV2Handler(state, statsEngine))
serverMux.HandleFunc(statsPath, statsV2Handler(state, statsEngine))

limiter := tollbooth.NewLimiter(rateLimitPerSecond, nil)
limiter.SetBurst(rateLimitBurstPerSecond)
limiter := tollbooth.NewLimiter(int64(steadyStateRate), nil)
limiter.SetBurst(burstRate)
// Log all requests and then pass through to serverMux
loggingServeMux := http.NewServeMux()
loggingServeMux.Handle("/", tollbooth.LimitHandler(
Expand Down
7 changes: 5 additions & 2 deletions agent/handlers/taskmetadata/handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"net/url"
"testing"

"github.com/aws/amazon-ecs-agent/agent/config"
"github.com/aws/amazon-ecs-agent/agent/credentials"
mock_credentials "github.com/aws/amazon-ecs-agent/agent/credentials/mocks"
mock_audit "github.com/aws/amazon-ecs-agent/agent/logger/audit/mocks"
Expand Down Expand Up @@ -174,7 +175,8 @@ func testErrorResponsesFromServer(t *testing.T, path string, expectedErrorMessag

credentialsManager := mock_credentials.NewMockManager(ctrl)
auditLog := mock_audit.NewMockAuditLogger(ctrl)
server := setupServer(credentialsManager, auditLog, nil, "", nil)
server := setupServer(credentialsManager, auditLog, nil, "", nil, config.DefaultTaskMetadataSteadyStateRate,
config.DefaultTaskMetadataBurstRate)

recorder := httptest.NewRecorder()
req, _ := http.NewRequest("GET", path, nil)
Expand Down Expand Up @@ -207,7 +209,8 @@ func getResponseForCredentialsRequest(t *testing.T, expectedStatus int,
defer ctrl.Finish()
credentialsManager := mock_credentials.NewMockManager(ctrl)
auditLog := mock_audit.NewMockAuditLogger(ctrl)
server := setupServer(credentialsManager, auditLog, nil, "", nil)
server := setupServer(credentialsManager, auditLog, nil, "", nil, config.DefaultTaskMetadataSteadyStateRate,
config.DefaultTaskMetadataBurstRate)
recorder := httptest.NewRecorder()

creds, ok := getCredentials()
Expand Down
13 changes: 9 additions & 4 deletions agent/handlers/taskmetadata/taskinfo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"time"

"github.com/aws/amazon-ecs-agent/agent/api"
"github.com/aws/amazon-ecs-agent/agent/config"
"github.com/aws/amazon-ecs-agent/agent/containermetadata"
"github.com/aws/amazon-ecs-agent/agent/credentials"
"github.com/aws/amazon-ecs-agent/agent/engine/dockerstate/mocks"
Expand Down Expand Up @@ -165,7 +166,8 @@ func TestTaskMetadata(t *testing.T) {
state.EXPECT().TaskByArn(taskARN).Return(task, true),
state.EXPECT().ContainerMapByArn(taskARN).Return(containerNameToDockerContainer, true),
)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine,
config.DefaultTaskMetadataSteadyStateRate, config.DefaultTaskMetadataBurstRate)
recorder := httptest.NewRecorder()
req, _ := http.NewRequest("GET", metadataPath, nil)
req.RemoteAddr = remoteIP + ":" + remotePort
Expand All @@ -192,7 +194,8 @@ func TestContainerMetadata(t *testing.T) {
state.EXPECT().ContainerByID(containerID).Return(dockerContainer, true),
state.EXPECT().TaskByID(containerID).Return(task, true),
)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine,
config.DefaultTaskMetadataSteadyStateRate, config.DefaultTaskMetadataBurstRate)
recorder := httptest.NewRecorder()
req, _ := http.NewRequest("GET", metadataPath+"/"+containerID, nil)
req.RemoteAddr = remoteIP + ":" + remotePort
Expand All @@ -219,7 +222,8 @@ func TestContainerStats(t *testing.T) {
state.EXPECT().GetTaskByIPAddress(remoteIP).Return(taskARN, true),
statsEngine.EXPECT().ContainerDockerStats(taskARN, containerID).Return(dockerStats, nil),
)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine,
config.DefaultTaskMetadataSteadyStateRate, config.DefaultTaskMetadataBurstRate)
recorder := httptest.NewRecorder()
req, _ := http.NewRequest("GET", statsPath+"/"+containerID, nil)
req.RemoteAddr = remoteIP + ":" + remotePort
Expand Down Expand Up @@ -252,7 +256,8 @@ func TestTaskStats(t *testing.T) {
state.EXPECT().ContainerMapByArn(taskARN).Return(containerMap, true),
statsEngine.EXPECT().ContainerDockerStats(taskARN, containerID).Return(dockerStats, nil),
)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine)
server := setupServer(credentials.NewManager(), auditLog, state, clusterName, statsEngine,
config.DefaultTaskMetadataSteadyStateRate, config.DefaultTaskMetadataBurstRate)
recorder := httptest.NewRecorder()
req, _ := http.NewRequest("GET", statsPath, nil)
req.RemoteAddr = remoteIP + ":" + remotePort
Expand Down

0 comments on commit 91dc8fa

Please sign in to comment.