Skip to content

Commit d85dea4

Browse files
authored
Merge pull request #77 from arangodb/resilient-improvements
Added terminated-pod cleanup to speed up re-creation of pods.
2 parents 05fca79 + ba2141f commit d85dea4

File tree

7 files changed

+108
-1
lines changed

7 files changed

+108
-1
lines changed

pkg/deployment/context_impl.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,21 @@ func (d *Deployment) DeletePod(podName string) error {
141141
return nil
142142
}
143143

144+
// CleanupPod deletes a given pod with force and explicit UID.
145+
// If the pod does not exist, the error is ignored.
146+
func (d *Deployment) CleanupPod(p v1.Pod) error {
147+
log := d.deps.Log
148+
podName := p.GetName()
149+
ns := p.GetNamespace()
150+
options := metav1.NewDeleteOptions(0)
151+
options.Preconditions = metav1.NewUIDPreconditions(string(p.GetUID()))
152+
if err := d.deps.KubeCli.Core().Pods(ns).Delete(podName, options); err != nil && !k8sutil.IsNotFound(err) {
153+
log.Debug().Err(err).Str("pod", podName).Msg("Failed to cleanup pod")
154+
return maskAny(err)
155+
}
156+
return nil
157+
}
158+
144159
// DeletePvc deletes a persistent volume claim with given name in the namespace
145160
// of the deployment. If the pvc does not exist, the error is ignored.
146161
func (d *Deployment) DeletePvc(pvcName string) error {

pkg/deployment/deployment_inspector.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
8787
d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject))
8888
}
8989

90+
// At the end of the inspect, we cleanup terminated pods.
91+
if d.resources.CleanupTerminatedPods(); err != nil {
92+
hasError = true
93+
d.CreateEvent(k8sutil.NewErrorEvent("Pod cleanup failed", err, d.apiObject))
94+
}
95+
9096
// Update next interval (on errors)
9197
if hasError {
9298
if d.recentInspectionErrors == 0 {

pkg/deployment/resources/context.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,7 @@ type Context interface {
6262
CreateEvent(evt *v1.Event)
6363
// GetOwnedPods returns a list of all pods owned by the deployment.
6464
GetOwnedPods() ([]v1.Pod, error)
65+
// CleanupPod deletes a given pod with force and explicit UID.
66+
// If the pod does not exist, the error is ignored.
67+
CleanupPod(p v1.Pod) error
6568
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package resources
24+
25+
import (
26+
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
27+
28+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
29+
)
30+
31+
// CleanupTerminatedPods removes all pods in Terminated state that belong to a member in Created state.
32+
func (r *Resources) CleanupTerminatedPods() error {
33+
log := r.log
34+
35+
pods, err := r.context.GetOwnedPods()
36+
if err != nil {
37+
log.Debug().Err(err).Msg("Failed to get owned pods")
38+
return maskAny(err)
39+
}
40+
41+
// Update member status from all pods found
42+
status := r.context.GetStatus()
43+
for _, p := range pods {
44+
if k8sutil.IsArangoDBImageIDAndVersionPod(p) {
45+
// Image ID pods are not relevant to inspect here
46+
continue
47+
}
48+
49+
// Check pod state
50+
if !(k8sutil.IsPodSucceeded(&p) || k8sutil.IsPodFailed(&p)) {
51+
continue
52+
}
53+
54+
// Find member status
55+
memberStatus, _, found := status.Members.MemberStatusByPodName(p.GetName())
56+
if !found {
57+
log.Debug().Str("pod", p.GetName()).Msg("no memberstatus found for pod")
58+
continue
59+
}
60+
61+
// Check member termination condition
62+
if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
63+
continue
64+
}
65+
66+
// Ok, we can delete the pod
67+
log.Debug().Str("pod-name", p.GetName()).Msg("Cleanup terminated pod")
68+
if err := r.context.CleanupPod(p); err != nil {
69+
log.Warn().Err(err).Str("pod-name", p.GetName()).Msg("Failed to cleanup pod")
70+
}
71+
}
72+
return nil
73+
}

pkg/deployment/resources/pod_creator.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
369369
if err := k8sutil.CreateArangodPod(kubecli, spec.IsDevelopment(), apiObject, role, m.ID, m.PodName, m.PersistentVolumeClaimName, info.ImageID, spec.GetImagePullPolicy(), args, env, livenessProbe, readinessProbe, tlsKeyfileSecretName, rocksdbEncryptionSecretName); err != nil {
370370
return maskAny(err)
371371
}
372+
log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
372373
} else if group.IsArangosync() {
373374
// Find image ID
374375
info, found := status.Images.GetByImage(spec.Sync.GetImage())
@@ -390,6 +391,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, group api.Server
390391
if err := k8sutil.CreateArangoSyncPod(kubecli, spec.IsDevelopment(), apiObject, role, m.ID, m.PodName, info.ImageID, spec.Sync.GetImagePullPolicy(), args, env, livenessProbe, affinityWithRole); err != nil {
391392
return maskAny(err)
392393
}
394+
log.Debug().Str("pod-name", m.PodName).Msg("Created pod")
393395
}
394396
// Record new member state
395397
m.State = newState

pkg/deployment/resources/pod_inspector.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,27 +70,30 @@ func (r *Resources) InspectPods() error {
7070
if k8sutil.IsPodSucceeded(&p) {
7171
// Pod has terminated with exit code 0.
7272
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
73+
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
7374
updateMemberStatusNeeded = true
7475
}
7576
} else if k8sutil.IsPodFailed(&p) {
7677
// Pod has terminated with at least 1 container with a non-zero exit code.
7778
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
79+
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
7880
updateMemberStatusNeeded = true
7981
}
8082
}
8183
if k8sutil.IsPodReady(&p) {
8284
// Pod is now ready
8385
if memberStatus.Conditions.Update(api.ConditionTypeReady, true, "Pod Ready", "") {
86+
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to true")
8487
updateMemberStatusNeeded = true
8588
}
8689
} else {
8790
// Pod is not ready
8891
if memberStatus.Conditions.Update(api.ConditionTypeReady, false, "Pod Not Ready", "") {
92+
log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to false")
8993
updateMemberStatusNeeded = true
9094
}
9195
}
9296
if updateMemberStatusNeeded {
93-
log.Debug().Str("pod-name", p.GetName()).Msg("Updated member status member for pod")
9497
if err := status.Members.UpdateMemberStatus(memberStatus, group); err != nil {
9598
return maskAny(err)
9699
}
@@ -123,6 +126,7 @@ func (r *Resources) InspectPods() error {
123126
}
124127
}
125128
default:
129+
log.Debug().Str("pod-name", podName).Msg("Pod is gone")
126130
m.State = api.MemberStateNone // This is trigger a recreate of the pod.
127131
// Create event
128132
events = append(events, k8sutil.NewPodGoneEvent(podName, group.AsRole(), apiObject))

tests/resilience_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ func TestResiliencePVC(t *testing.T) {
140140

141141
// Delete one pvc after the other
142142
apiObject.ForeachServerGroup(func(group api.ServerGroup, spec api.ServerGroupSpec, status *api.MemberStatusList) error {
143+
if group == api.ServerGroupCoordinators {
144+
// Coordinators have no PVC
145+
return nil
146+
}
143147
for _, m := range *status {
144148
// Get current pvc so we can compare UID later
145149
originalPVC, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{})

0 commit comments

Comments
 (0)