-
Notifications
You must be signed in to change notification settings - Fork 126
/
pod_status.go
133 lines (107 loc) · 4 KB
/
pod_status.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
package util
import (
"fmt"
v1 "k8s.io/api/core/v1"
"github.com/armadaproject/armada/internal/common/util"
"github.com/armadaproject/armada/pkg/armadaevents"
)
// imagePullBackOffStatesSet holds the container waiting reasons that
// hasUnstableContainerStates treats as unstable. It is indexed by reason
// string and the lookup result is used directly as a boolean.
var imagePullBackOffStatesSet = util.StringListToSet([]string{"ImagePullBackOff", "ErrImagePull"})
// Reason strings compared against pod and container status fields in this file.
const (
// oomKilledReason is the terminated-container reason that isOom checks for.
oomKilledReason = "OOMKilled"
// evictedReason is the pod status reason that ExtractPodFailureCause maps to
// KubernetesReason_Evicted.
evictedReason = "Evicted"
// deadlineExceeded is the pod status reason that ExtractPodFailureCause maps
// to KubernetesReason_DeadlineExceeded.
deadlineExceeded = "DeadlineExceeded"
)
// ExtractPodFailedReason returns a human-readable explanation of why pod failed.
//
// If pod.Status.Message is non-empty it is returned unchanged. Otherwise the
// result is assembled from every container status (regular containers first,
// then init containers) whose terminated state has a non-zero exit code, one
// newline-terminated line per container. The empty string is returned when no
// such container exists.
//
// TODO: Need to detect pod preemption. So that job failed events can include a string indicating a pod was preempted.
// We need this so that whatever system submitted the job knows the job was preempted.
func ExtractPodFailedReason(pod *v1.Pod) string {
	if pod.Status.Message != "" {
		return pod.Status.Message
	}

	// Walk the two status slices separately instead of appending one onto the
	// other: append may write into spare capacity of the pod's own slice and
	// so modify memory owned by the caller's pod object.
	statusGroups := [][]v1.ContainerStatus{
		pod.Status.ContainerStatuses,
		pod.Status.InitContainerStatuses,
	}

	failedMessage := ""
	for _, statuses := range statusGroups {
		for i := range statuses {
			terminated := statuses[i].State.Terminated
			if terminated == nil || terminated.ExitCode == 0 {
				// Still running/waiting, or exited successfully.
				continue
			}
			failedMessage += fmt.Sprintf(
				"Container %s failed with exit code %d because %s: %s\n",
				statuses[i].Name,
				terminated.ExitCode,
				terminated.Reason,
				terminated.Message,
			)
		}
	}
	return failedMessage
}
// ExtractPodFailureCause classifies why pod failed.
//
// The pod-level status reason is checked first: "Evicted" yields
// KubernetesReason_Evicted and "DeadlineExceeded" yields
// KubernetesReason_DeadlineExceeded. Otherwise, if any container or init
// container status satisfies isOom, KubernetesReason_OOM is returned. In all
// remaining cases the result is KubernetesReason_AppError.
func ExtractPodFailureCause(pod *v1.Pod) armadaevents.KubernetesReason {
	switch pod.Status.Reason {
	case evictedReason:
		return armadaevents.KubernetesReason_Evicted
	case deadlineExceeded:
		return armadaevents.KubernetesReason_DeadlineExceeded
	}

	// Iterate the two status slices separately instead of appending one onto
	// the other: append may write into spare capacity of the pod's own slice
	// and so modify memory owned by the caller's pod object.
	statusGroups := [][]v1.ContainerStatus{
		pod.Status.ContainerStatuses,
		pod.Status.InitContainerStatuses,
	}
	for _, statuses := range statusGroups {
		for i := range statuses {
			if isOom(statuses[i]) {
				return armadaevents.KubernetesReason_OOM
			}
		}
	}
	return armadaevents.KubernetesReason_AppError
}
// ExtractPodExitCodes returns the exit code of every terminated container in
// pod, keyed by container name. Regular container statuses are processed
// before init container statuses, so if the same name appears in both, the
// init container's exit code is the one kept. Containers without a terminated
// state are omitted. The returned map is never nil.
func ExtractPodExitCodes(pod *v1.Pod) map[string]int32 {
	// Iterate the two status slices separately instead of appending one onto
	// the other: append may write into spare capacity of the pod's own slice
	// and so modify memory owned by the caller's pod object.
	statusGroups := [][]v1.ContainerStatus{
		pod.Status.ContainerStatuses,
		pod.Status.InitContainerStatuses,
	}

	exitCodes := make(
		map[string]int32,
		len(pod.Status.ContainerStatuses)+len(pod.Status.InitContainerStatuses),
	)
	for _, statuses := range statusGroups {
		for i := range statuses {
			if terminated := statuses[i].State.Terminated; terminated != nil {
				exitCodes[statuses[i].Name] = terminated.ExitCode
			}
		}
	}
	return exitCodes
}
// ExtractFailedPodContainerStatuses builds one ContainerError for every
// terminated container of pod (regular containers first, then init
// containers). Containers that have not terminated are skipped.
//
// Each entry carries the container's exit code, message and reason, plus an
// ObjectMeta whose ExecutorId is clusterId, whose Namespace is the pod's
// namespace and whose Name is the container name. KubernetesReason is
// KubernetesReason_OOM when isOom reports true for the container, and
// KubernetesReason_AppError otherwise. The returned slice is never nil.
func ExtractFailedPodContainerStatuses(pod *v1.Pod, clusterId string) []*armadaevents.ContainerError {
	// Iterate the two status slices separately instead of appending one onto
	// the other: append may write into spare capacity of the pod's own slice
	// and so modify memory owned by the caller's pod object.
	statusGroups := [][]v1.ContainerStatus{
		pod.Status.ContainerStatuses,
		pod.Status.InitContainerStatuses,
	}

	returnStatuses := make(
		[]*armadaevents.ContainerError,
		0,
		len(pod.Status.ContainerStatuses)+len(pod.Status.InitContainerStatuses),
	)
	for _, statuses := range statusGroups {
		for i := range statuses {
			terminated := statuses[i].State.Terminated
			if terminated == nil {
				// This function is meant to find the exit statuses of containers.
				// Skip non-finished containers.
				continue
			}

			reason := armadaevents.KubernetesReason_AppError
			if isOom(statuses[i]) {
				reason = armadaevents.KubernetesReason_OOM
			}

			returnStatuses = append(returnStatuses, &armadaevents.ContainerError{
				ExitCode: terminated.ExitCode,
				Message:  terminated.Message,
				Reason:   terminated.Reason,
				ObjectMeta: &armadaevents.ObjectMeta{
					ExecutorId:   clusterId,
					Namespace:    pod.Namespace,
					Name:         statuses[i].Name,
					KubernetesId: "", // only the id of the pod is stored in the failed message
				},
				KubernetesReason: reason,
			})
		}
	}
	return returnStatuses
}
// isOom reports whether containerStatus describes a terminated container whose
// termination reason equals oomKilledReason ("OOMKilled").
func isOom(containerStatus v1.ContainerStatus) bool {
	terminated := containerStatus.State.Terminated
	if terminated == nil {
		// Not terminated, so it cannot have been OOM-killed.
		return false
	}
	return terminated.Reason == oomKilledReason
}
// PodStartupStatus is an integer-backed type. Its values are not declared in
// this part of the file.
// NOTE(review): presumably enumerates the startup states of a pod — confirm
// against the constants that use this type.
type PodStartupStatus int
// hasUnstableContainerStates reports whether any container status returned by
// GetPodContainerStatuses(pod) is in a waiting state whose reason is a member
// of imagePullBackOffStatesSet.
func hasUnstableContainerStates(pod *v1.Pod) bool {
	for _, status := range GetPodContainerStatuses(pod) {
		waiting := status.State.Waiting
		if waiting == nil {
			// Only waiting containers carry a waiting reason to inspect.
			continue
		}
		if imagePullBackOffStatesSet[waiting.Reason] {
			return true
		}
	}
	return false
}