Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[YUNIKORN-521] Placeholder pods are not cleaned when the job is deleted #232

Merged
merged 5 commits into from
Mar 16, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 28 additions & 5 deletions pkg/appmgmt/general/general.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@
package general

import (
"reflect"

"go.uber.org/zap"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
k8sCache "k8s.io/client-go/tools/cache"
Expand Down Expand Up @@ -123,20 +126,40 @@ func (os *Manager) getAppMetadata(pod *v1.Pod) (interfaces.ApplicationMetadata,
if err != nil {
log.Logger().Error("unable to get taskGroups by given pod", zap.Error(err))
}
ownerReferences := getOwnerReferences(pod)

placeholderTimeout, err := utils.GetPlaceholderTimeoutParam(pod)
if err != nil {
log.Logger().Warn("unable to get placeholder timeout by given pod.", zap.Error(err))
}
return interfaces.ApplicationMetadata{
ApplicationID: appId,
QueueName: utils.GetQueueNameFromPod(pod),
User: user,
Tags: tags,
TaskGroups: taskGroups,
ApplicationID: appId,
QueueName: utils.GetQueueNameFromPod(pod),
User: user,
Tags: tags,
TaskGroups: taskGroups,
PlaceholderTimeoutInSec: placeholderTimeout,
OwnerReferences: ownerReferences,
}, true
}

func getOwnerReferences(pod *v1.Pod) []metav1.OwnerReference {
yangwwei marked this conversation as resolved.
Show resolved Hide resolved
if len(pod.OwnerReferences) > 0 {
return pod.OwnerReferences
}
controller := false
blockOwnerDeletion := true
ref := metav1.OwnerReference{
APIVersion: v1.SchemeGroupVersion.String(),
Kind: reflect.TypeOf(v1.Pod{}).Name(),
Name: pod.Name,
UID: pod.UID,
Controller: &controller,
BlockOwnerDeletion: &blockOwnerDeletion,
}
return []metav1.OwnerReference{ref}
}

// filter pods by scheduler name and state
func (os *Manager) filterPods(obj interface{}) bool {
switch obj.(type) {
Expand Down
28 changes: 28 additions & 0 deletions pkg/appmgmt/general/general_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,3 +498,31 @@ func TestGetExistingAllocation(t *testing.T) {
assert.Equal(t, alloc.UUID, string(pod.UID))
assert.Equal(t, alloc.NodeID, "allocated-node")
}

// TestGetOwnerReferences covers both branches of getOwnerReferences: a pod
// that already has an owner reference, and a pod without one, for which the
// pod itself is expected to be reported as the owner.
func TestGetOwnerReferences(t *testing.T) {
	ownerRef := apis.OwnerReference{
		APIVersion: apis.SchemeGroupVersion.String(),
		Name:       "owner ref",
	}
	podWithOwnerRef := &v1.Pod{
		ObjectMeta: apis.ObjectMeta{
			OwnerReferences: []apis.OwnerReference{ownerRef},
		},
	}
	podWithNoOwnerRef := &v1.Pod{
		ObjectMeta: apis.ObjectMeta{
			Name: "pod",
			UID:  "uid",
		},
	}

	// existing owner references are passed through unchanged
	returnedOwnerRefs := getOwnerReferences(podWithOwnerRef)
	assert.Assert(t, len(returnedOwnerRefs) == 1, "Only one owner reference is expected")
	assert.DeepEqual(t, ownerRef, returnedOwnerRefs[0])

	// without owner references the pod itself becomes the owner
	returnedOwnerRefs = getOwnerReferences(podWithNoOwnerRef)
	assert.Assert(t, len(returnedOwnerRefs) == 1, "Only one owner reference is expected")
	assert.Equal(t, returnedOwnerRefs[0].Name, podWithNoOwnerRef.Name, "Unexpected owner reference name")
	assert.Equal(t, returnedOwnerRefs[0].UID, podWithNoOwnerRef.UID, "Unexpected owner reference UID")
	assert.Equal(t, returnedOwnerRefs[0].Kind, "Pod", "Unexpected owner reference Kind")
	assert.Equal(t, returnedOwnerRefs[0].APIVersion, v1.SchemeGroupVersion.String(), "Unexpected owner reference APIVersion")
}
12 changes: 7 additions & 5 deletions pkg/appmgmt/interfaces/amprotocol.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package interfaces

import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/apache/incubator-yunikorn-k8shim/pkg/apis/yunikorn.apache.org/v1alpha1"
)
Expand Down Expand Up @@ -69,12 +70,13 @@ type AddTaskRequest struct {
}

// ApplicationMetadata carries the application-level information that is
// extracted from a pod when an application is registered.
type ApplicationMetadata struct {
	ApplicationID string
	QueueName     string
	User          string
	Tags          map[string]string
	// TaskGroups are the task group definitions read from the pod.
	TaskGroups []v1alpha1.TaskGroup
	// PlaceholderTimeoutInSec is the placeholder timeout parameter read from
	// the pod, in seconds.
	PlaceholderTimeoutInSec int64
	// OwnerReferences are the owner references handed to the application for
	// the placeholder pods it creates.
	OwnerReferences []metav1.OwnerReference
}

type TaskMetadata struct {
Expand Down
34 changes: 21 additions & 13 deletions pkg/cache/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/looplab/fsm"
"go.uber.org/zap"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/apache/incubator-yunikorn-core/pkg/api"
"github.com/apache/incubator-yunikorn-k8shim/pkg/apis/yunikorn.apache.org/v1alpha1"
Expand All @@ -41,19 +42,20 @@ import (
)

// Application is the cached representation of a single application: its
// identity, its tasks and task groups, its state machine and the settings
// used for its placeholder pods.
type Application struct {
	applicationID    string
	queue            string
	partition        string
	user             string
	taskMap          map[string]*Task
	tags             map[string]string
	schedulingPolicy v1alpha1.SchedulingPolicy
	taskGroups       []v1alpha1.TaskGroup
	// placeholderOwnerReferences are set as the owner references of every
	// placeholder pod created for this application.
	placeholderOwnerReferences []metav1.OwnerReference
	sm                         *fsm.FSM
	// lock is taken by the accessor and mutator methods of the application.
	lock                    *sync.RWMutex
	schedulerAPI            api.SchedulerAPI
	placeholderAsk          *si.Resource // total placeholder request for the app (all task groups)
	placeholderTimeoutInSec int64
}

func (app *Application) String() string {
Expand Down Expand Up @@ -225,6 +227,12 @@ func (app *Application) getTaskGroups() []v1alpha1.TaskGroup {
return app.taskGroups
}

// setOwnReferences replaces the owner references that are attached to the
// placeholder pods of this application.
//
// The field is written here, so the exclusive lock is required; a read lock
// would allow this write to race with concurrent readers and other writers.
func (app *Application) setOwnReferences(ref []metav1.OwnerReference) {
	app.lock.Lock()
	defer app.lock.Unlock()
	app.placeholderOwnerReferences = ref
}

func (app *Application) addTask(task *Task) {
app.lock.Lock()
defer app.lock.Unlock()
Expand Down
1 change: 1 addition & 0 deletions pkg/cache/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ func (ctx *Context) AddApplication(request *interfaces.AddApplicationRequest) in
ctx.apiProvider.GetAPIs().SchedulerAPI)
app.setTaskGroups(request.Metadata.TaskGroups)
app.SetPlaceholderTimeout(request.Metadata.PlaceholderTimeoutInSec)
app.setOwnReferences(request.Metadata.OwnerReferences)

// add into cache
ctx.applications[app.applicationID] = app
Expand Down
6 changes: 6 additions & 0 deletions pkg/cache/placeholder.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ type Placeholder struct {
}

func newPlaceholder(placeholderName string, app *Application, taskGroup v1alpha1.TaskGroup) *Placeholder {
ownerRefs := app.placeholderOwnerReferences
controller := false
for _, r := range ownerRefs {
*r.Controller = controller
}
yangwwei marked this conversation as resolved.
Show resolved Hide resolved
placeholderPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: placeholderName,
Expand All @@ -58,6 +63,7 @@ func newPlaceholder(placeholderName string, app *Application, taskGroup v1alpha1
constants.AnnotationPlaceholderFlag: "true",
constants.AnnotationTaskGroupName: taskGroup.Name,
}),
OwnerReferences: ownerRefs,
},
Spec: v1.PodSpec{
SecurityContext: &v1.PodSecurityContext{
Expand Down
99 changes: 63 additions & 36 deletions pkg/cache/placeholder_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,69 @@ import (
"github.com/apache/incubator-yunikorn-k8shim/pkg/common/constants"
)

// Fixture values shared by the placeholder manager tests in this file.
const (
appID = "app01"
queue = "root.default"
namespace = "test"
)

// TestCreateAppPlaceholders verifies that placeholders are created for every
// task group member without any owner reference by default, and that a failure
// to create a single placeholder pod is reported back as an error.
func TestCreateAppPlaceholders(t *testing.T) {
	app := createAppWIthTaskGroupForTest()
	mockedAPIProvider := client.NewMockedAPIProvider()
	createdPods := createAndCheckPlaceholderCreate(mockedAPIProvider, app, t)
	for _, pod := range createdPods {
		assert.Assert(t, len(pod.OwnerReferences) == 0, "By default the pod should not have owner reference set")
	}

	// simulate placeholder creation failures
	// failed to create one placeholder
	mockedAPIProvider.MockCreateFn(func(pod *v1.Pod) (*v1.Pod, error) {
		if pod.Name == "tg-test-group-2-app01-15" {
			return nil, fmt.Errorf("failed to create pod %s", pod.Name)
		}
		return pod, nil
	})
	// placeholderMgr was installed by createAndCheckPlaceholderCreate and is
	// reused here with the failing create function.
	err := placeholderMgr.createAppPlaceholders(app)
	assert.Error(t, err, "failed to create pod tg-test-group-2-app01-15")
}

// createAndCheckPlaceholderCreate installs a pod-create mock that records every
// pod it is asked to create, points the package-level placeholderMgr at the
// mocked clients and creates the placeholders for app. It asserts that the
// creation succeeds and yields 30 pods, and returns the recorded pods keyed by
// pod name.
func createAndCheckPlaceholderCreate(mockedAPIProvider *client.MockedAPIProvider, app *Application, t *testing.T) map[string]*v1.Pod {
	podsByName := map[string]*v1.Pod{}
	recordPod := func(p *v1.Pod) (*v1.Pod, error) {
		podsByName[p.Name] = p
		return p, nil
	}
	mockedAPIProvider.MockCreateFn(recordPod)

	placeholderMgr = &PlaceholderManager{
		clients: mockedAPIProvider.GetAPIs(),
		RWMutex: sync.RWMutex{},
	}

	createErr := placeholderMgr.createAppPlaceholders(app)
	assert.NilError(t, createErr, "create app placeholders should be successful")
	assert.Equal(t, len(podsByName), 30)
	return podsByName
}

func TestCreateAppPlaceholdersWithOwnReference(t *testing.T) {
app := createAppWIthTaskGroupForTest()
controller := true
ownRef := apis.OwnerReference{
Name: "JobId",
UID: "JobUid",
Controller: &controller,
}
app.setOwnReferences([]apis.OwnerReference{ownRef})
mockedAPIProvider := client.NewMockedAPIProvider()
pods := createAndCheckPlaceholderCreate(mockedAPIProvider, app, t)
for _, pod := range pods {
assert.Assert(t, len(pod.OwnerReferences) == 1, "The pod should have exactly one owner reference set")
assert.Assert(t, *pod.OwnerReferences[0].Controller == false, "The owner reference should not be a controller")
assert.Equal(t, pod.OwnerReferences[0].Name, ownRef.Name, "The owner reference name does not match")
assert.Equal(t, pod.OwnerReferences[0].UID, ownRef.UID, "The owner reference UID does not match")
}
}

func createAppWIthTaskGroupForTest() *Application {
mockedSchedulerAPI := newMockSchedulerAPI()
app := NewApplication(appID, queue,
"bob", map[string]string{constants.AppTagNamespace: namespace}, mockedSchedulerAPI)
Expand All @@ -63,40 +120,10 @@ func TestCreateAppPlaceholders(t *testing.T) {
},
},
})

createdPods := make(map[string]*v1.Pod)
mockedAPIProvider := client.NewMockedAPIProvider()
mockedAPIProvider.MockCreateFn(func(pod *v1.Pod) (*v1.Pod, error) {
createdPods[pod.Name] = pod
return pod, nil
})
placeholderMgr := &PlaceholderManager{
clients: mockedAPIProvider.GetAPIs(),
RWMutex: sync.RWMutex{},
}

err := placeholderMgr.createAppPlaceholders(app)
assert.NilError(t, err, "create app placeholders should be successful")
assert.Equal(t, len(createdPods), 30)

// simulate placeholder creation failures
// failed to create one placeholder
mockedAPIProvider.MockCreateFn(func(pod *v1.Pod) (*v1.Pod, error) {
if pod.Name == "tg-test-group-2-app01-15" {
return nil, fmt.Errorf("failed to create pod %s", pod.Name)
}
return pod, nil
})
err = placeholderMgr.createAppPlaceholders(app)
assert.Error(t, err, "failed to create pod tg-test-group-2-app01-15")
return app
}

func TestCleanUp(t *testing.T) {
const (
appID = "app01"
queue = "root.default"
namespace = "test"
)
mockedContext := initContextForTest()
mockedSchedulerAPI := newMockSchedulerAPI()
app := NewApplication(appID, queue,
Expand Down