-
Notifications
You must be signed in to change notification settings - Fork 3.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(controller): Enhanced TTL controller scalability #4736
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,8 +10,7 @@ You cannot horizontally scale the controller. | |
|
||
You can scale the controller vertically: | ||
|
||
- If you have workflows with many steps, increase `--pod-workers`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't recommend changing |
||
- If you have many workflows, increase `--workflow-workers`. | ||
- If you have many workflows, increase `--workflow-workers` and `--workflow-ttl-workers`. | ||
- Increase both `--qps` and `--burst`. | ||
|
||
You will need to increase the controller's memory and CPU. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,8 @@ import ( | |
|
||
const ( | ||
workflowTTLResyncPeriod = 20 * time.Minute | ||
// 1s is usually enough time for the informer to get synced and be up-to-date | ||
enoughTimeForInformerSync = time.Second | ||
) | ||
|
||
type ConfigSupplier func() *config.Config | ||
|
@@ -49,28 +51,29 @@ func NewController(wfClientset wfclientset.Interface, wfInformer cache.SharedInd | |
wfInformer.AddEventHandler(cache.FilteringResourceEventHandler{ | ||
FilterFunc: func(obj interface{}) bool { | ||
un, ok := obj.(*unstructured.Unstructured) | ||
return ok && un.GetLabels()[common.LabelKeyCompleted] == "true" && un.GetLabels()[common.LabelKeyWorkflowArchivingStatus] != "Pending" | ||
return ok && un.GetDeletionTimestamp() == nil && un.GetLabels()[common.LabelKeyCompleted] == "true" && un.GetLabels()[common.LabelKeyWorkflowArchivingStatus] != "Pending" | ||
}, | ||
Handler: cache.ResourceEventHandlerFuncs{ | ||
AddFunc: controller.enqueueWF, | ||
UpdateFunc: func(old, new interface{}) { | ||
controller.enqueueWF(new) | ||
}, | ||
DeleteFunc: controller.enqueueWF, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is invoked after the WF is deleted - so it is not needed - removing reduces the number of API requests by 1 |
||
}, | ||
}) | ||
return controller | ||
} | ||
|
||
func (c *Controller) Run(stopCh <-chan struct{}) error { | ||
func (c *Controller) Run(stopCh <-chan struct{}, workflowTTLWorkers int) error { | ||
defer runtimeutil.HandleCrash() | ||
defer c.workqueue.ShutDown() | ||
log.Infof("Starting workflow TTL controller (resync %v)", c.resyncPeriod) | ||
log.Infof("Starting workflow TTL controller (resync %v, workflowTTLWorkers %d)", c.resyncPeriod, workflowTTLWorkers) | ||
go c.wfInformer.Run(stopCh) | ||
if ok := cache.WaitForCacheSync(stopCh, c.wfInformer.HasSynced); !ok { | ||
return fmt.Errorf("failed to wait for caches to sync") | ||
} | ||
go wait.Until(c.runWorker, time.Second, stopCh) | ||
for i := 0; i < workflowTTLWorkers; i++ { | ||
go wait.Until(c.runWorker, time.Second, stopCh) | ||
} | ||
log.Info("Started workflow TTL worker") | ||
<-stopCh | ||
log.Info("Shutting workflow TTL worker") | ||
|
@@ -129,9 +132,6 @@ func (c *Controller) enqueueWF(obj interface{}) { | |
log.Warnf("'%v' is not an unstructured", obj) | ||
return | ||
} | ||
if un.GetDeletionTimestamp() != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. redundant now it is checked by |
||
return | ||
} | ||
wf, err := util.FromUnstructured(un) | ||
if err != nil { | ||
log.Warnf("Failed to unmarshal workflow %v object: %v", obj, err) | ||
|
@@ -152,7 +152,12 @@ func (c *Controller) enqueueWF(obj interface{}) { | |
runtimeutil.HandleError(err) | ||
return | ||
} | ||
//c.workqueue.Add(key) | ||
// if we try and delete in the next second, it is almost certain that the informer is out of sync. Because we | ||
// double-check that sees if the workflow in the informer is already deleted and we'll make 2 API requests when | ||
// one is enough. | ||
if addAfter < enoughTimeForInformerSync { | ||
addAfter = enoughTimeForInformerSync | ||
} | ||
log.Infof("Queueing workflow %s/%s for delete in %v", wf.Namespace, wf.Name, addAfter) | ||
c.workqueue.AddAfter(key, addAfter) | ||
} | ||
|
@@ -177,6 +182,9 @@ func (c *Controller) deleteWorkflow(key string) error { | |
log.Warnf("Key '%s' in index is not an unstructured", key) | ||
return nil | ||
} | ||
if un.GetDeletionTimestamp() != nil { | ||
return nil | ||
} | ||
wf, err := util.FromUnstructured(un) | ||
if err != nil { | ||
log.Warnf("Failed to unmarshal key '%s' to workflow object: %v", key, err) | ||
|
@@ -187,9 +195,14 @@ func (c *Controller) deleteWorkflow(key string) error { | |
|
||
err = c.wfclientset.ArgoprojV1alpha1().Workflows(wf.Namespace).Delete(wf.Name, &metav1.DeleteOptions{PropagationPolicy: commonutil.GetDeletePropagation()}) | ||
if err != nil { | ||
return err | ||
if apierr.IsNotFound(err) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've not seen this error apart from initial testing - prevented by line 155 I think, but a good bonus check |
||
log.Infof("workflow already deleted '%s'", key) | ||
} else { | ||
return err | ||
} | ||
} else { | ||
log.Infof("Successfully deleted '%s'", key) | ||
} | ||
log.Infof("Successfully deleted '%s'", key) | ||
} | ||
return nil | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for adding this. This was my proposal during the prod incident. But We don't want to make too many changes