Skip to content

Commit

Permalink
feat(controller): Estimate workflow duration. Fixes argoproj#2717
Browse files Browse the repository at this point in the history
  • Loading branch information
alexec committed Sep 21, 2020
1 parent ed59408 commit 30ffa3e
Show file tree
Hide file tree
Showing 50 changed files with 1,614 additions and 610 deletions.
10 changes: 10 additions & 0 deletions api/openapi-spec/swagger.json
Expand Up @@ -2952,6 +2952,11 @@
"description": "DisplayName is a human readable representation of the node. Unique within a template boundary",
"type": "string"
},
"estimatedDuration": {
"description": "EstimatedDuration in seconds.",
"type": "integer",
"format": "int32"
},
"finishedAt": {
"description": "Time at which this node completed",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
Expand Down Expand Up @@ -4506,6 +4511,11 @@
"$ref": "#/definitions/io.argoproj.workflow.v1alpha1.Condition"
}
},
"estimatedDuration": {
"description": "EstimatedDuration in seconds.",
"type": "integer",
"format": "int32"
},
"finishedAt": {
"description": "Time at which this workflow completed",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
Expand Down
6 changes: 5 additions & 1 deletion cmd/argo/commands/get.go
Expand Up @@ -141,10 +141,14 @@ func printWorkflowHelper(wf *wfv1.Workflow, getArgs getFlags) string {
if !wf.Status.StartedAt.IsZero() {
out += fmt.Sprintf(fmtStr, "Duration:", humanize.RelativeDuration(wf.Status.StartedAt.Time, wf.Status.FinishedAt.Time))
}
if wf.Status.Phase == wfv1.NodeRunning {
if wf.Status.EstimatedDuration > 0 {
out += fmt.Sprintf(fmtStr, "EstimatedDuration:", humanize.Duration(wf.Status.EstimatedDuration.ToDuration()))
}
}
if !wf.Status.ResourcesDuration.IsZero() {
out += fmt.Sprintf(fmtStr, "ResourcesDuration:", wf.Status.ResourcesDuration)
}

if len(wf.Spec.Arguments.Parameters) > 0 {
out += fmt.Sprintf(fmtStr, "Parameters:", "")
for _, param := range wf.Spec.Arguments.Parameters {
Expand Down
29 changes: 19 additions & 10 deletions cmd/argo/commands/get_test.go
Expand Up @@ -9,9 +9,9 @@ import (

"github.com/stretchr/testify/assert"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/yaml"

wfv1 "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1"
testutil "github.com/argoproj/argo/test/util"
)

func testPrintNodeImpl(t *testing.T, expected string, node wfv1.NodeStatus, nodePrefix string, getArgs getFlags) {
Expand Down Expand Up @@ -108,7 +108,20 @@ func TestStatusToNodeFieldSelector(t *testing.T) {
assert.Equal(t, "phase=Running", one)
}

var indexTest = `apiVersion: argoproj.io/v1alpha1
func Test_printWorkflowHelper(t *testing.T) {
t.Run("EstimatedDuration", func(t *testing.T) {
var wf wfv1.Workflow
testutil.MustUnmarshallYAML(`
status:
estimatedDuration: 1
phase: Running
`, &wf)
output := printWorkflowHelper(&wf, getFlags{})
assert.Regexp(t, `EstimatedDuration: *1 second`, output)
})
t.Run("IndexOrdering", func(t *testing.T) {
var wf wfv1.Workflow
testutil.MustUnmarshallYAML(`apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
creationTimestamp: "2020-06-02T16:04:21Z"
Expand Down Expand Up @@ -338,15 +351,11 @@ status:
type: Pod
phase: Succeeded
startedAt: "2020-06-02T16:04:21Z"
`

func TestIndexOrdering(t *testing.T) {
var wf wfv1.Workflow
err := yaml.Unmarshal([]byte(indexTest), &wf)
if assert.NoError(t, err) {
assert.Contains(t, printWorkflowHelper(&wf, getFlags{}), `
`, &wf)
output := printWorkflowHelper(&wf, getFlags{})
assert.Contains(t, output, `
├- sleep(9:nine) sleep many-items-z26lj-2619926859 19s
├- sleep(10:ten) sleep many-items-z26lj-1052882686 23s
├- sleep(11:eleven) sleep many-items-z26lj-3011405271 22s`)
}
})
}
20 changes: 20 additions & 0 deletions docs/estimated-duration.md
@@ -0,0 +1,20 @@
# Estimated Duration

![alpha](assets/alpha.svg)

> v2.12 and after
When you run a workflow, the controller will try to estimate its duration.

This is based on the most recently successful workflow submitted from the same workflow template, cluster workflow template or cron workflow.

To get this data, the controller queries the Kubernetes API first (as this is faster) and then [workflow archive](workflow-archive.md) (if enabled).

If you've used tools like Jenkins, you'll know that that estimates can be inaccurate:

* A pod spent a long amount of time pending scheduling.
* The workflow is non-deterministic, e.g. it uses `when` to execute different paths.
* The workflow can vary is scale, e.g. sometimes it uses `withItems` and so sometimes run 100 nodes, sometimes a 1000.
* If the pod runtimes are unpredictable.
* The workflow is parameterized, and different parameters affect its duration.

24 changes: 24 additions & 0 deletions docs/fields.md
Expand Up @@ -316,6 +316,8 @@ WorkflowTemplate is the definition of a workflow template resource

- [`cron-backfill.yaml`](https://github.com/argoproj/argo/blob/master/examples/cron-backfill.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`templates.yaml`](https://github.com/argoproj/argo/blob/master/examples/workflow-template/templates.yaml)
</details>

Expand Down Expand Up @@ -525,6 +527,8 @@ WorkflowSpec is the specification of a Workflow.

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down Expand Up @@ -659,13 +663,15 @@ WorkflowStatus contains overall status information about a workflow
|:----------:|:----------:|---------------|
|`compressedNodes`|`string`|Compressed and base64 decoded Nodes map|
|`conditions`|`Array<`[`Condition`](#condition)`>`|Conditions is a list of conditions the Workflow may have|
|`estimatedDuration`|`int32`|EstimatedDuration in seconds.|
|`finishedAt`|[`Time`](#time)|Time at which this workflow completed|
|`message`|`string`|A human readable message indicating details about why the workflow is in this condition.|
|`nodes`|[`NodeStatus`](#nodestatus)|Nodes is a mapping between a node ID and the node's status.|
|`offloadNodeStatusVersion`|`string`|Whether on not node status has been offloaded to a database. If exists, then Nodes and CompressedNodes will be empty. This will actually be populated with a hash of the offloaded data.|
|`outputs`|[`Outputs`](#outputs)|Outputs captures output values and artifact locations produced by the workflow via global outputs|
|`persistentVolumeClaims`|`Array<`[`Volume`](#volume)`>`|PersistentVolumeClaims tracks all PVCs that were created as part of the io.argoproj.workflow.v1alpha1. The contents of this list are drained at the end of the workflow.|
|`phase`|`string`|Phase a simple, high-level summary of where the workflow is in its lifecycle.|
|`progress`|`string`|Progress to completion|
|`resourcesDuration`|`Map< integer , int64 >`|ResourcesDuration is the total for the workflow|
|`startedAt`|[`Time`](#time)|Time at which this workflow started|
|`storedTemplates`|[`Template`](#template)|StoredTemplates is a mapping between a template ref and the node's status.|
Expand Down Expand Up @@ -870,6 +876,8 @@ CronWorkflowSpec is the specification of a CronWorkflow

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down Expand Up @@ -1177,6 +1185,8 @@ WorkflowTemplateSpec is a spec of WorkflowTemplate.

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down Expand Up @@ -1739,6 +1749,8 @@ Template is a reusable and composable unit of execution in a workflow

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down Expand Up @@ -1936,6 +1948,7 @@ NodeStatus contains status information about an individual node in the workflow
|`children`|`Array< string >`|Children is a list of child node IDs|
|`daemoned`|`boolean`|Daemoned tracks whether or not this node was daemoned and need to be terminated|
|`displayName`|`string`|DisplayName is a human readable representation of the node. Unique within a template boundary|
|`estimatedDuration`|`int32`|EstimatedDuration in seconds.|
|`finishedAt`|[`Time`](#time)|Time at which this node completed|
|`hostNodeName`|`string`|HostNodeName name of the Kubernetes node on which the Pod is running, if applicable|
|`id`|`string`|ID is a unique identifier of a node within the worklow It is implemented as a hash of the node name, which makes the ID deterministic|
Expand All @@ -1947,6 +1960,7 @@ NodeStatus contains status information about an individual node in the workflow
|`outputs`|[`Outputs`](#outputs)|Outputs captures output parameter values and artifact locations produced by this template invocation|
|`phase`|`string`|Phase a simple, high-level summary of where the node is in its lifecycle. Can be used as a state machine.|
|`podIP`|`string`|PodIP captures the IP of the pod for daemoned steps|
|`progress`|`string`|Progress to completion|
|`resourcesDuration`|`Map< integer , int64 >`|ResourcesDuration is indicative, but not accurate, resource duration. This is populated when the nodes completes.|
|`startedAt`|[`Time`](#time)|Time at which this node started|
|~`storedTemplateID`~|~`string`~|~StoredTemplateID is the ID of stored template.~ DEPRECATED: This value is not used anymore.|
Expand Down Expand Up @@ -2825,6 +2839,8 @@ Pod metdata

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down Expand Up @@ -4116,6 +4132,8 @@ ObjectMeta is metadata that all persisted resources must have, which includes al

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down Expand Up @@ -4628,6 +4646,8 @@ A single application container that you want to run within a pod.

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`resubmit.yaml`](https://github.com/argoproj/argo/blob/master/examples/resubmit.yaml)

- [`retry-backoff.yaml`](https://github.com/argoproj/argo/blob/master/examples/retry-backoff.yaml)
Expand Down Expand Up @@ -5284,6 +5304,8 @@ PersistentVolumeClaimSpec describes the common attributes of storage devices and

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down Expand Up @@ -6299,6 +6321,8 @@ ListMeta describes metadata that synthetic resources must have, including lists

- [`pod-spec-yaml-patch.yaml`](https://github.com/argoproj/argo/blob/master/examples/pod-spec-yaml-patch.yaml)

- [`progress-workflowtemplate.yaml`](https://github.com/argoproj/argo/blob/master/examples/progress-workflowtemplate.yaml)

- [`recursive-for-loop.yaml`](https://github.com/argoproj/argo/blob/master/examples/recursive-for-loop.yaml)

- [`resource-delete-with-flags.yaml`](https://github.com/argoproj/argo/blob/master/examples/resource-delete-with-flags.yaml)
Expand Down
4 changes: 4 additions & 0 deletions docs/swagger.md
Expand Up @@ -1234,6 +1234,7 @@ NodeStatus contains status information about an individual node in the workflow
| children | [ string ] | Children is a list of child node IDs | No |
| daemoned | boolean | Daemoned tracks whether or not this node was daemoned and need to be terminated | No |
| displayName | string | DisplayName is a human readable representation of the node. Unique within a template boundary | No |
| estimatedDuration | integer | EstimatedDuration in seconds. | No |
| finishedAt | [io.k8s.apimachinery.pkg.apis.meta.v1.Time](#io.k8s.apimachinery.pkg.apis.meta.v1.time) | Time at which this node completed | No |
| hostNodeName | string | HostNodeName name of the Kubernetes node on which the Pod is running, if applicable | No |
| id | string | ID is a unique identifier of a node within the worklow It is implemented as a hash of the node name, which makes the ID deterministic | Yes |
Expand All @@ -1245,6 +1246,7 @@ NodeStatus contains status information about an individual node in the workflow
| outputs | [io.argoproj.workflow.v1alpha1.Outputs](#io.argoproj.workflow.v1alpha1.outputs) | Outputs captures output parameter values and artifact locations produced by this template invocation | No |
| phase | string | Phase a simple, high-level summary of where the node is in its lifecycle. Can be used as a state machine. | No |
| podIP | string | PodIP captures the IP of the pod for daemoned steps | No |
| progress | string | Progress to completion | No |
| resourcesDuration | object | ResourcesDuration is indicative, but not accurate, resource duration. This is populated when the nodes completes. | No |
| startedAt | [io.k8s.apimachinery.pkg.apis.meta.v1.Time](#io.k8s.apimachinery.pkg.apis.meta.v1.time) | Time at which this node started | No |
| storedTemplateID | string | StoredTemplateID is the ID of stored template. DEPRECATED: This value is not used anymore. | No |
Expand Down Expand Up @@ -1792,13 +1794,15 @@ WorkflowStatus contains overall status information about a workflow
| ---- | ---- | ----------- | -------- |
| compressedNodes | string | Compressed and base64 decoded Nodes map | No |
| conditions | [ [io.argoproj.workflow.v1alpha1.Condition](#io.argoproj.workflow.v1alpha1.condition) ] | Conditions is a list of conditions the Workflow may have | No |
| estimatedDuration | integer | EstimatedDuration in seconds. | No |
| finishedAt | [io.k8s.apimachinery.pkg.apis.meta.v1.Time](#io.k8s.apimachinery.pkg.apis.meta.v1.time) | Time at which this workflow completed | No |
| message | string | A human readable message indicating details about why the workflow is in this condition. | No |
| nodes | object | Nodes is a mapping between a node ID and the node's status. | No |
| offloadNodeStatusVersion | string | Whether on not node status has been offloaded to a database. If exists, then Nodes and CompressedNodes will be empty. This will actually be populated with a hash of the offloaded data. | No |
| outputs | [io.argoproj.workflow.v1alpha1.Outputs](#io.argoproj.workflow.v1alpha1.outputs) | Outputs captures output values and artifact locations produced by the workflow via global outputs | No |
| persistentVolumeClaims | [ [io.k8s.api.core.v1.Volume](#io.k8s.api.core.v1.volume) ] | PersistentVolumeClaims tracks all PVCs that were created as part of the io.argoproj.workflow.v1alpha1. The contents of this list are drained at the end of the workflow. | No |
| phase | string | Phase a simple, high-level summary of where the workflow is in its lifecycle. | No |
| progress | string | Progress to completion | No |
| resourcesDuration | object | ResourcesDuration is the total for the workflow | No |
| startedAt | [io.k8s.apimachinery.pkg.apis.meta.v1.Time](#io.k8s.apimachinery.pkg.apis.meta.v1.time) | Time at which this workflow started | No |
| storedTemplates | object | StoredTemplates is a mapping between a template ref and the node's status. | No |
Expand Down
4 changes: 4 additions & 0 deletions manifests/base/crds/full/argoproj.io_workflows.yaml
Expand Up @@ -6591,6 +6591,8 @@ spec:
type: string
type: object
type: array
estimatedDuration:
type: integer
finishedAt:
format: date-time
type: string
Expand All @@ -6609,6 +6611,8 @@ spec:
type: boolean
displayName:
type: string
estimatedDuration:
type: integer
finishedAt:
format: date-time
type: string
Expand Down
2 changes: 2 additions & 0 deletions mkdocs.yml
Expand Up @@ -39,6 +39,8 @@ nav:
- enhanced-depends-logic.md
- artifact-repository-ref.md
- resource-duration.md
- estimated-duration.md
- progress.md
- workflow-creator.md
- synchronization.md
- workflow-of-workflows.md
Expand Down
1 change: 1 addition & 0 deletions persist/sqldb/workflow_archive.go
Expand Up @@ -46,6 +46,7 @@ type archivedWorkflowLabelRecord struct {

type WorkflowArchive interface {
ArchiveWorkflow(wf *wfv1.Workflow) error
// list workflows, with the most recently started workflows at the beginning (i.e. index 0 is the most recent)
ListWorkflows(namespace string, minStartAt, maxStartAt time.Time, labelRequirements labels.Requirements, limit, offset int) (wfv1.Workflows, error)
GetWorkflow(uid string) (*wfv1.Workflow, error)
DeleteWorkflow(uid string) error
Expand Down
14 changes: 14 additions & 0 deletions pkg/apis/workflow/v1alpha1/estimated_duration.go
@@ -0,0 +1,14 @@
package v1alpha1

import "time"

// EstimatedDuration is in seconds.
type EstimatedDuration int

func (d EstimatedDuration) ToDuration() time.Duration {
return time.Second * time.Duration(d)
}

func NewEstimatedDuration(d time.Duration) EstimatedDuration {
return EstimatedDuration(d.Seconds())
}
14 changes: 14 additions & 0 deletions pkg/apis/workflow/v1alpha1/estimated_duration_test.go
@@ -0,0 +1,14 @@
package v1alpha1

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
)

func TestEstimatedDuration(t *testing.T) {
duration := NewEstimatedDuration(time.Minute)
assert.Equal(t, EstimatedDuration(60), duration)
assert.Equal(t, time.Duration(time.Minute), duration.ToDuration())
}

0 comments on commit 30ffa3e

Please sign in to comment.