Merge feature ecs-anywhere-gpu support to dev #3040

Merged: 7 commits into dev, Sep 28, 2021
agent/api/task/task.go: 21 additions & 8 deletions
@@ -458,8 +458,12 @@ func (task *Task) addGPUResource(cfg *config.Config) error {
                 container.GPUIDs = append(container.GPUIDs, association.Name)
             }
         }
-        task.populateGPUEnvironmentVariables()
-        task.NvidiaRuntime = cfg.NvidiaRuntime
+        // For external instances, GPU IDs are handled by resources struct
+        // For internal instances, GPU IDs are handled by env var
+        if !cfg.External.Enabled() {
+            task.populateGPUEnvironmentVariables()
+            task.NvidiaRuntime = cfg.NvidiaRuntime
+        }
     }
     return nil
 }
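The branch added above hinges on the agent's External setting, which the new tests later in this PR toggle with config.ExplicitlyEnabled. A minimal standalone sketch of how that selector behaves, assuming the config package imports as github.com/aws/amazon-ecs-agent/agent/config (illustrative only, not code from this PR):

package main

import (
    "fmt"

    "github.com/aws/amazon-ecs-agent/agent/config"
)

func main() {
    // GPU support on, as in the new tests.
    cfg := &config.Config{GPUSupportEnabled: true}

    // ECS Anywhere (external) instances set this explicitly; managed EC2
    // instances leave it at its default, so Enabled() returns false there.
    cfg.External.Value = config.ExplicitlyEnabled

    if cfg.External.Enabled() {
        fmt.Println("external: GPU IDs are passed via Resources.DeviceRequests")
    } else {
        fmt.Println("internal: GPU IDs go through env vars and the NVIDIA runtime")
    }
}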
@@ -1466,7 +1470,7 @@ func (task *Task) dockerHostConfig(container *apicontainer.Container, dockerCont
         return nil, &apierrors.HostConfigError{Msg: err.Error()}
     }

-    resources := task.getDockerResources(container)
+    resources := task.getDockerResources(container, cfg)

     // Populate hostConfig
     hostConfig := &dockercontainer.HostConfig{
@@ -1531,11 +1535,13 @@ func (task *Task) overrideContainerRuntime(container *apicontainer.Container, ho
 func (task *Task) overrideContainerRuntime(container *apicontainer.Container, hostCfg *dockercontainer.HostConfig,
     cfg *config.Config) *apierrors.HostConfigError {
     if task.isGPUEnabled() && task.shouldRequireNvidiaRuntime(container) {
-        if task.NvidiaRuntime == "" {
-            return &apierrors.HostConfigError{Msg: "Runtime is not set for GPU containers"}
+        if !cfg.External.Enabled() {
+            if task.NvidiaRuntime == "" {
+                return &apierrors.HostConfigError{Msg: "Runtime is not set for GPU containers"}
+            }
+            seelog.Debugf("Setting runtime as %s for container %s", task.NvidiaRuntime, container.Name)
+            hostCfg.Runtime = task.NvidiaRuntime
         }
-        seelog.Debugf("Setting runtime as %s for container %s", task.NvidiaRuntime, container.Name)
-        hostCfg.Runtime = task.NvidiaRuntime
     }

     if cfg.InferentiaSupportEnabled && container.RequireNeuronRuntime() {
@@ -1546,7 +1552,7 @@ func (task *Task) overrideContainerRuntime(container *apicontainer.Container, ho
 }

 // Requires an *apicontainer.Container and returns the Resources for the HostConfig struct
-func (task *Task) getDockerResources(container *apicontainer.Container) dockercontainer.Resources {
+func (task *Task) getDockerResources(container *apicontainer.Container, cfg *config.Config) dockercontainer.Resources {
     // Convert MB to B and set Memory
     dockerMem := int64(container.Memory * 1024 * 1024)
     if dockerMem != 0 && dockerMem < apicontainer.DockerContainerMinimumMemoryInBytes {
@@ -1560,6 +1566,13 @@ func (task *Task) getDockerResources(container *apicontainer.Container) dockerco
         Memory:    dockerMem,
         CPUShares: cpuShare,
     }
+    if cfg.External.Enabled() && cfg.GPUSupportEnabled {
+        deviceRequest := dockercontainer.DeviceRequest{
+            Capabilities: [][]string{[]string{"gpu"}},
+            DeviceIDs:    container.GPUIDs,
+        }
+        resources.DeviceRequests = []dockercontainer.DeviceRequest{deviceRequest}
+    }
     return resources
 }
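The DeviceRequests block added above is the Docker API's equivalent of a docker run --gpus request. For reference, a self-contained sketch of the same resources built directly against the Docker Go SDK (github.com/docker/docker/api/types/container); the gpu1 ID is a placeholder, like the one used in the new tests below:

package main

import (
    "fmt"

    dockercontainer "github.com/docker/docker/api/types/container"
)

func main() {
    // On an external (ECS Anywhere) instance the agent now attaches the
    // task's GPU IDs as a device request, roughly what
    // `docker run --gpus '"device=gpu1"'` would produce.
    gpuIDs := []string{"gpu1"} // placeholder; real IDs come from the task's GPU associations

    resources := dockercontainer.Resources{
        Memory:    256 * 1024 * 1024, // 256 MiB, matching the test fixtures
        CPUShares: 10,
        DeviceRequests: []dockercontainer.DeviceRequest{
            {
                Capabilities: [][]string{{"gpu"}},
                DeviceIDs:    gpuIDs,
            },
        },
    }

    fmt.Printf("device request: %+v\n", resources.DeviceRequests[0])
}

With Driver left empty and the "gpu" capability set, dockerd typically resolves the request with the NVIDIA GPU driver, which is why the external path no longer needs the explicit hostCfg.Runtime override skipped in the hunk above.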
agent/api/task/task_test.go: 53 additions & 4 deletions
@@ -568,7 +568,8 @@ func TestGetDockerResources(t *testing.T) {
             },
         },
     }
-    resources := testTask.getDockerResources(testTask.Containers[0])
+    cfg := &config.Config{}
+    resources := testTask.getDockerResources(testTask.Containers[0], cfg)
     assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
     assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")
 }
@@ -586,7 +587,8 @@ func TestGetDockerResourcesCPUTooLow(t *testing.T) {
             },
         },
     }
-    resources := testTask.getDockerResources(testTask.Containers[0])
+    cfg := &config.Config{}
+    resources := testTask.getDockerResources(testTask.Containers[0], cfg)
     assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")

     // Minimum requirement of 2 CPU Shares
@@ -608,7 +610,8 @@ func TestGetDockerResourcesMemoryTooLow(t *testing.T) {
             },
         },
     }
-    resources := testTask.getDockerResources(testTask.Containers[0])
+    cfg := &config.Config{}
+    resources := testTask.getDockerResources(testTask.Containers[0], cfg)
     assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
     assert.Equal(t, int64(apicontainer.DockerContainerMinimumMemoryInBytes), resources.Memory,
         "Wrong amount of memory")
@@ -626,11 +629,57 @@ func TestGetDockerResourcesUnspecifiedMemory(t *testing.T) {
             },
         },
     }
-    resources := testTask.getDockerResources(testTask.Containers[0])
+    cfg := &config.Config{}
+    resources := testTask.getDockerResources(testTask.Containers[0], cfg)
     assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
     assert.Equal(t, int64(0), resources.Memory, "Wrong amount of memory")
 }

+func TestGetDockerResourcesExternalGPUInstance(t *testing.T) {
+    container := &apicontainer.Container{
+        Name:   "c1",
+        CPU:    uint(10),
+        Memory: uint(256),
+        GPUIDs: []string{"gpu1"},
+    }
+    testTask := &Task{
+        Arn:        "arn:aws:ecs:us-east-1:012345678910:task/c09f0188-7f87-4b0f-bfc3-16296622b6fe",
+        Family:     "myFamily",
+        Version:    "1",
+        Containers: []*apicontainer.Container{container},
+    }
+    cfg := &config.Config{
+        GPUSupportEnabled: true,
+    }
+    cfg.External.Value = config.ExplicitlyEnabled
+    resources := testTask.getDockerResources(testTask.Containers[0], cfg)
+    assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
+    assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")
+    assert.Equal(t, resources.DeviceRequests[0].DeviceIDs, container.GPUIDs, "Wrong GPU IDs assigned")
+}
+
+func TestGetDockerResourcesInternalGPUInstance(t *testing.T) {
+    container := &apicontainer.Container{
+        Name:   "c1",
+        CPU:    uint(10),
+        Memory: uint(256),
+        GPUIDs: []string{"gpu1"},
+    }
+    testTask := &Task{
+        Arn:        "arn:aws:ecs:us-east-1:012345678910:task/c09f0188-7f87-4b0f-bfc3-16296622b6fe",
+        Family:     "myFamily",
+        Version:    "1",
+        Containers: []*apicontainer.Container{container},
+    }
+    cfg := &config.Config{
+        GPUSupportEnabled: true,
+    }
+    resources := testTask.getDockerResources(testTask.Containers[0], cfg)
+    assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
+    assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")
+    assert.Equal(t, int64(len(resources.DeviceRequests)), int64(0), "GPU IDs to be handled by env var for internal instance")
+}
+
 func TestPostUnmarshalTaskWithDockerVolumes(t *testing.T) {
     autoprovision := true
     ctrl := gomock.NewController(t)
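Finally, an end-to-end illustration (again a sketch, not agent code) of how a host config carrying such a device request reaches the daemon through a recent Docker Go client; the image, command, and container name are placeholders:

package main

import (
    "context"
    "fmt"

    dockercontainer "github.com/docker/docker/api/types/container"
    "github.com/docker/docker/client"
)

func main() {
    ctx := context.Background()

    cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
    if err != nil {
        panic(err)
    }

    hostCfg := &dockercontainer.HostConfig{
        Resources: dockercontainer.Resources{
            DeviceRequests: []dockercontainer.DeviceRequest{
                {Capabilities: [][]string{{"gpu"}}, DeviceIDs: []string{"gpu1"}}, // placeholder ID
            },
        },
    }

    // Placeholder image and command; on a GPU host the devices selected
    // above become visible inside the created container.
    created, err := cli.ContainerCreate(ctx,
        &dockercontainer.Config{Image: "nvidia/cuda:11.0-base", Cmd: []string{"nvidia-smi"}},
        hostCfg, nil, nil, "gpu-smoke-test")
    if err != nil {
        panic(err)
    }
    fmt.Println("created container", created.ID)
}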