Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Nodes with pods deleted out-of-band should be Errored, not Failed #2855

Merged
merged 7 commits into from May 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 3 additions & 4 deletions test/e2e/functional_test.go
Expand Up @@ -163,11 +163,11 @@ func (s *FunctionalSuite) TestFastFailOnPodTermination() {
WaitForWorkflow(120 * time.Second).
Then().
ExpectWorkflow(func(t *testing.T, _ *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
assert.Equal(t, wfv1.NodeFailed, status.Phase)
assert.Equal(t, wfv1.NodeError, status.Phase)
assert.Len(t, status.Nodes, 4)
nodeStatus := status.Nodes.FindByDisplayName("sleep")
assert.Equal(t, wfv1.NodeFailed, nodeStatus.Phase)
assert.Equal(t, "pod termination", nodeStatus.Message)
assert.Equal(t, wfv1.NodeError, nodeStatus.Phase)
assert.Equal(t, "pod deleted during operation", nodeStatus.Message)
})
}

Expand Down Expand Up @@ -397,7 +397,6 @@ func (s *FunctionalSuite) TestGlobalScope() {
}

func (s *FunctionalSuite) TestStopBehavior() {
s.T().SkipNow()
s.Given().
Workflow("@functional/stop-terminate.yaml").
When().
Expand Down
6 changes: 4 additions & 2 deletions workflow/controller/exec_control.go
Expand Up @@ -74,8 +74,10 @@ func (woc *wfOperationCtx) applyExecutionControl(pod *apiv1.Pod, wfNodesLock *sy

var newDeadline *time.Time
if woc.wf.Spec.Shutdown != "" {
// Signal termination by setting a Zero deadline
newDeadline = &time.Time{}
_, onExitPod := pod.Labels[common.LabelKeyOnExit]
if woc.wf.Spec.Shutdown == wfv1.ShutdownStrategyTerminate || (woc.wf.Spec.Shutdown == wfv1.ShutdownStrategyStop && !onExitPod) {
newDeadline = &time.Time{}
}
Comment on lines +77 to +80
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This particular code fixes #2914

} else {
if podExecCtl.Deadline == nil && woc.workflowDeadline == nil {
return nil
Expand Down
4 changes: 2 additions & 2 deletions workflow/controller/operator.go
Expand Up @@ -957,8 +957,8 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, node *wfv1.NodeStatu
case apiv1.PodRunning:
if pod.DeletionTimestamp != nil {
// pod is being terminated
newPhase = wfv1.NodeFailed
message = "pod termination"
newPhase = wfv1.NodeError
message = "pod deleted during operation"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this needs to be reverted as well, since you will be reverting the logic for deleting running pods.

Copy link
Member Author

@simster7 simster7 May 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is unrelated. It makes sure that nodes whose pods are deleted out-of-band get marked as Error instead of Failed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's simply a change to how the node is labelled: its phase becomes Error instead of Failed.

} else {
newPhase = wfv1.NodeRunning
tmplStr, ok := pod.Annotations[common.AnnotationKeyTemplate]
Expand Down
4 changes: 2 additions & 2 deletions workflow/controller/operator_test.go
Expand Up @@ -606,15 +606,15 @@ func TestAssessNodeStatus(t *testing.T) {
node: &wfv1.NodeStatus{},
want: wfv1.NodeFailed,
}, {
name: "pod termination",
name: "pod deleted during operation",
pod: &apiv1.Pod{
ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &metav1.Time{Time: time.Now()}},
Status: apiv1.PodStatus{
Phase: apiv1.PodRunning,
},
},
node: &wfv1.NodeStatus{},
want: wfv1.NodeFailed,
want: wfv1.NodeError,
}, {
name: "pod running",
pod: &apiv1.Pod{
Expand Down