Skip to content

Commit

Permalink
fix: Workflow stuck at running when init container failed but wait container did not. Fixes #10717 (#10740)
Browse files Browse the repository at this point in the history
  • Loading branch information
terrytangyuan committed Mar 29, 2023
1 parent a3d64b2 commit e715488
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 8 deletions.
19 changes: 11 additions & 8 deletions workflow/controller/operator.go
Expand Up @@ -1337,23 +1337,26 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus
new.Outputs.ExitCode = pointer.StringPtr(fmt.Sprint(*exitCode))
}

// We cannot fail the node until the wait container is finished because it may be busy saving outputs, and these
// would not get captured successfully.
for _, c := range pod.Status.ContainerStatuses {
if c.Name == common.WaitContainerName && c.State.Terminated == nil && new.Phase.Completed() {
woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ")
new.Phase = old.Phase
}
}
// If the init container failed, we should mark the node as failed.
var initContainerFailed bool
for _, c := range pod.Status.InitContainerStatuses {
if c.State.Terminated != nil && int(c.State.Terminated.ExitCode) != 0 {
new.Phase = wfv1.NodeFailed
initContainerFailed = true
woc.log.WithField("new.phase", new.Phase).Info("marking node as failed since init container has non-zero exit code")
break
}
}

// We cannot fail the node until the wait container is finished (unless any init container has failed) because it may be busy saving outputs, and these
// would not get captured successfully.
for _, c := range pod.Status.ContainerStatuses {
if (c.Name == common.WaitContainerName && c.State.Terminated == nil && new.Phase.Completed()) && !initContainerFailed {
woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ")
new.Phase = old.Phase
}
}

// if we are transitioning from Pending to a different state, clear out unchanged message
if old.Phase == wfv1.NodePending && new.Phase != wfv1.NodePending && old.Message == new.Message {
new.Message = ""
Expand Down
26 changes: 26 additions & 0 deletions workflow/controller/operator_test.go
Expand Up @@ -1357,6 +1357,32 @@ func TestAssessNodeStatus(t *testing.T) {
},
node: &wfv1.NodeStatus{TemplateName: templateName},
want: wfv1.NodeFailed,
}, {
name: "pod failed - init container failed but neither wait nor main containers are finished",
pod: &apiv1.Pod{
Status: apiv1.PodStatus{
InitContainerStatuses: []apiv1.ContainerStatus{
{
Name: common.InitContainerName,
State: apiv1.ContainerState{Terminated: &apiv1.ContainerStateTerminated{ExitCode: 1}},
},
},
ContainerStatuses: []apiv1.ContainerStatus{
{
Name: common.WaitContainerName,
State: apiv1.ContainerState{Terminated: nil},
},
{
Name: common.MainContainerName,
State: apiv1.ContainerState{Terminated: nil},
},
},
Message: "failed since init container failed",
Phase: apiv1.PodFailed,
},
},
node: &wfv1.NodeStatus{TemplateName: templateName},
want: wfv1.NodeFailed,
}, {
name: "pod running",
pod: &apiv1.Pod{
Expand Down

0 comments on commit e715488

Please sign in to comment.