Skip to content

Commit

Permalink
[cps-2.8] fix: deleting pod stuck on Terminating status (#1323)
Browse files Browse the repository at this point in the history
* fix: deleting pod stuck on Terminating status

WorkflowRun GC SHOULD wait until all workload pods related to the
WorkflowRun have been deleted, and only then start the GC pod to clean
data on the PV. Otherwise, if the path used by workload pods in the PV
is deleted before the workload pods themselves are deleted, the pod
deletion process will get stuck in Terminating status.

* fix(gc): deleting pod stuck on Terminating status

If some pods have not finished and this is not the last GC attempt,
cyclone will not start the GC pod to clean data on the PV.
  • Loading branch information
caicloud-bot committed Nov 12, 2019
1 parent c100f35 commit ff77bfa
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 1 deletion.
17 changes: 17 additions & 0 deletions pkg/meta/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ func (pk PodKind) String() string {
// Known PodKind values. PodKindWorkload is used as the value of the
// LabelPodKind label (see WorkloadPodSelector).
const (
// PodKindGC marks a pod that is used for GC purposes.
PodKindGC PodKind = "gc"
// PodKindWorkload marks a pod that is used to run a stage workload.
PodKindWorkload PodKind = "workload"
)

// ProjectSelector is a selector for cyclone CRD resources which have corresponding project label
Expand Down Expand Up @@ -186,6 +188,21 @@ func WorkflowRunSelector() string {
return fmt.Sprintf("%s=%s", LabelControllerInstance, instance)
}

// WorkflowRunPodSelector builds an equality-based label selector string of the
// form "<LabelWorkflowRunName>=<wfr>", matching pods that belong to the
// WorkflowRun named wfr.
func WorkflowRunPodSelector(wfr string) string {
	const equality = "%s=%s"
	selector := fmt.Sprintf(equality, LabelWorkflowRunName, wfr)
	return selector
}

// WorkloadPodSelector selects pods that used to execute workload.
func WorkloadPodSelector() string {
return fmt.Sprintf("%s=%s", LabelPodKind, PodKindWorkload.String())
}

// WorkflowRunWorkloadPodSelector joins the WorkflowRun selector for wfr and the
// workload-kind selector with a comma, producing a selector string for the
// workload pods of that WorkflowRun.
func WorkflowRunWorkloadPodSelector(wfr string) string {
	byRun := WorkflowRunPodSelector(wfr)
	byKind := WorkloadPodSelector()
	return fmt.Sprintf("%s,%s", byRun, byKind)
}

// LabelExistsSelector returns a label selector to query resources with label key exists.
func LabelExistsSelector(key string) string {
selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
Expand Down
42 changes: 41 additions & 1 deletion pkg/workflow/workflowrun/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"os"
"reflect"
"sync"
"time"

log "github.com/sirupsen/logrus"
Expand Down Expand Up @@ -407,6 +408,8 @@ func (o *operator) Reconcile() error {
// - 'wfrDeletion' indicates whether the GC is performed because of WorkflowRun deleted. In this case,
// GC would performed silently, without event recording and status updating.
func (o *operator) GC(lastTry, wfrDeletion bool) error {
wg := sync.WaitGroup{}
allPodsFinished := true
// For each pod created, delete it.
for stg, status := range o.wfr.Status.Stages {
// For non-terminated stage, update status to cancelled.
Expand Down Expand Up @@ -441,10 +444,47 @@ func (o *operator) GC(lastTry, wfrDeletion bool) error {
o.recorder.Eventf(o.wfr, corev1.EventTypeWarning, "GC", "Delete pod '%s' error: %v", status.Pod.Name, err)
}
} else {
log.WithField("ns", status.Pod.Namespace).WithField("pod", status.Pod.Name).Info("Pod deleted")
log.WithField("ns", status.Pod.Namespace).WithField("pod", status.Pod.Name).Info("Start to delete pod")

wg.Add(1)
go func(namespace, podName string) {
defer wg.Done()

timeout := time.After(5 * time.Minute)
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
for {
select {
case <-timeout:
allPodsFinished = false
log.WithField("ns", namespace).WithField("pod", podName).Warn("Pod deletion timeout")
return
case <-ticker.C:
_, err := o.clusterClient.CoreV1().Pods(namespace).Get(podName, metav1.GetOptions{})
if err != nil && errors.IsNotFound(err) {
log.WithField("ns", namespace).WithField("pod", podName).Info("Pod deleted")
return
}
}
}
}(status.Pod.Namespace, status.Pod.Name)
}
}

// Wait until all workload pods related to the WorkflowRun have been deleted, then start the GC pod to clean data on PV.
// Otherwise, if the path used by workload pods in the PV is deleted before the workload pods are deleted,
// the pod deletion process will get stuck in Terminating status.
wg.Wait()

// If some pods have not finished and this is not the last GC attempt, we will not start the GC pod to clean
// data on PV. The last GC attempt will ensure data could be cleaned.
if !allPodsFinished && !lastTry {
if !wfrDeletion {
o.recorder.Eventf(o.wfr, corev1.EventTypeWarning, "GC", "There are stage pods not Finished")
}
return nil
}

// Get execution context of the WorkflowRun, namespace and PVC are defined in the context.
executionContext := GetExecutionContext(o.wfr)

Expand Down
1 change: 1 addition & 0 deletions pkg/workflow/workload/pod/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ func (m *Builder) Prepare() error {
Labels: map[string]string{
meta.LabelWorkflowRunName: m.wfr.Name,
meta.LabelPodCreatedBy: meta.CycloneCreator,
meta.LabelPodKind: meta.PodKindWorkload.String(),
},
Annotations: map[string]string{
meta.AnnotationIstioInject: meta.AnnotationValueFalse,
Expand Down

0 comments on commit ff77bfa

Please sign in to comment.