-
Notifications
You must be signed in to change notification settings - Fork 260
/
health_pod.go
133 lines (124 loc) · 4.44 KB
/
health_pod.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
package health
import (
"fmt"
"strings"
"github.com/argoproj/gitops-engine/pkg/utils/kube"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubectl/pkg/util/podutils"
)
// getPodHealth computes the health of a Pod supplied as an unstructured object.
//
// Only the core/v1 Pod GroupVersionKind is supported: the object is converted
// to a typed corev1.Pod and handed to getCorev1PodHealth. Any other GVK yields
// an "unsupported Pod GVK" error and a nil status. A conversion failure is
// returned wrapped, so callers can still inspect the underlying cause with
// errors.Is / errors.As.
func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
	gvk := obj.GroupVersionKind()
	switch gvk {
	case corev1.SchemeGroupVersion.WithKind(kube.PodKind):
		var pod corev1.Pod
		err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &pod)
		if err != nil {
			// %w (not %v) keeps the converter's error in the chain.
			return nil, fmt.Errorf("failed to convert unstructured Pod to typed: %w", err)
		}
		return getCorev1PodHealth(&pod)
	default:
		return nil, fmt.Errorf("unsupported Pod GVK: %s", gvk)
	}
}
// getCorev1PodHealth derives a HealthStatus from a typed core/v1 Pod.
//
// Evaluation order:
//  1. For pods with RestartPolicy Always, any container that is waiting with a
//     reason starting with "Err" or ending in "Error" / "BackOff" marks the pod
//     Degraded; the waiting messages are joined with ", ".
//  2. Otherwise the pod phase decides:
//     - Pending   -> Progressing
//     - Succeeded -> Healthy
//     - Failed    -> Degraded (message taken from the pod, else from the first
//     init/regular container that reports a failure, else empty)
//     - Running   -> depends on the restart policy and readiness (see below)
//  3. Any other phase, or a Running pod whose restart policy is none of
//     Always/OnFailure/Never, results in Unknown.
//
// The error result is always nil in the current implementation.
func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
	// This logic cannot be applied when the pod.Spec.RestartPolicy is: corev1.RestartPolicyOnFailure,
	// corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
	// The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
	// then we will prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
	// (e.g. the image is available), the resource hook pod will unexpectedly be executed even though the sync has
	// completed.
	if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
		var status HealthStatusCode
		var messages []string

		for _, containerStatus := range pod.Status.ContainerStatuses {
			waiting := containerStatus.State.Waiting
			// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
			if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
				status = HealthStatusDegraded
				messages = append(messages, waiting.Message)
			}
		}

		if status != "" {
			return &HealthStatus{
				Status:  status,
				Message: strings.Join(messages, ", "),
			}, nil
		}
	}

	// getFailMessage extracts a human-readable failure description from a
	// terminated container: its termination message if present, otherwise
	// "OOMKilled", otherwise a generic non-zero exit code message. It returns
	// "" when the container is not terminated or shows no sign of failure.
	getFailMessage := func(ctr *corev1.ContainerStatus) string {
		if ctr.State.Terminated != nil {
			if ctr.State.Terminated.Message != "" {
				return ctr.State.Terminated.Message
			}
			if ctr.State.Terminated.Reason == "OOMKilled" {
				return ctr.State.Terminated.Reason
			}
			if ctr.State.Terminated.ExitCode != 0 {
				return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
			}
		}
		return ""
	}

	switch pod.Status.Phase {
	case corev1.PodPending:
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: pod.Status.Message,
		}, nil
	case corev1.PodSucceeded:
		return &HealthStatus{
			Status:  HealthStatusHealthy,
			Message: pod.Status.Message,
		}, nil
	case corev1.PodFailed:
		if pod.Status.Message != "" {
			// Pod has a nice error message. Use that.
			return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}, nil
		}
		// Init containers are inspected before regular containers. The two
		// slices are walked separately instead of being merged with append():
		// appending to InitContainerStatuses could write into its backing
		// array when it has spare capacity, silently mutating the pod.
		// Indexing also avoids copying each ContainerStatus.
		for _, statuses := range [][]corev1.ContainerStatus{pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses} {
			for i := range statuses {
				if msg := getFailMessage(&statuses[i]); msg != "" {
					return &HealthStatus{Status: HealthStatusDegraded, Message: msg}, nil
				}
			}
		}

		return &HealthStatus{Status: HealthStatusDegraded, Message: ""}, nil
	case corev1.PodRunning:
		switch pod.Spec.RestartPolicy {
		case corev1.RestartPolicyAlways:
			// if pod is ready, it is automatically healthy
			if podutils.IsPodReady(pod) {
				return &HealthStatus{
					Status:  HealthStatusHealthy,
					Message: pod.Status.Message,
				}, nil
			}
			// if it's not ready, check to see if any container terminated, if so, it's degraded
			for _, ctrStatus := range pod.Status.ContainerStatuses {
				if ctrStatus.LastTerminationState.Terminated != nil {
					return &HealthStatus{
						Status:  HealthStatusDegraded,
						Message: pod.Status.Message,
					}, nil
				}
			}
			// otherwise we are progressing towards a ready state
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}, nil
		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
			// pods set with a restart policy of OnFailure or Never, have a finite life.
			// These pods are typically resource hooks. Thus, we consider these as Progressing
			// instead of healthy.
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}, nil
		}
	}

	// Unrecognized phase, or a Running pod with an unrecognized restart policy.
	return &HealthStatus{
		Status:  HealthStatusUnknown,
		Message: pod.Status.Message,
	}, nil
}