From abeaa41b86d415c64417a894146315ab4daef065 Mon Sep 17 00:00:00 2001 From: Adrian Aneci Date: Fri, 16 Dec 2022 22:30:05 +0200 Subject: [PATCH] Fix metrics and handle pods baked by custom controllers --- go.mod | 2 +- go.sum | 2 + internal/testing/e2e_test.go | 56 ++++++++++++++++++++++++- internal/testing/kind.yaml | 3 ++ internal/testing/prometheus_stuffs.yaml | 2 + pkg/handler/handler.go | 25 ++++++++--- pkg/metrics/metrics.go | 6 +-- 7 files changed, 84 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index c62253e..b4dbceb 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/go-co-op/gocron v1.16.2 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.13.0 + github.com/prometheus/common v0.37.0 github.com/sirupsen/logrus v1.9.0 github.com/spf13/cobra v1.5.0 github.com/spf13/viper v1.12.0 @@ -53,7 +54,6 @@ require ( github.com/pelletier/go-toml v1.9.5 // indirect github.com/pelletier/go-toml/v2 v2.0.1 // indirect github.com/prometheus/client_model v0.2.0 // indirect - github.com/prometheus/common v0.37.0 // indirect github.com/prometheus/procfs v0.8.0 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect github.com/spf13/afero v1.8.2 // indirect diff --git a/go.sum b/go.sum index f8a59e2..998de50 100644 --- a/go.sum +++ b/go.sum @@ -191,6 +191,7 @@ github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NH github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= @@ -237,6 +238,7 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= diff --git a/internal/testing/e2e_test.go b/internal/testing/e2e_test.go index 31c5d28..1348575 100644 --- a/internal/testing/e2e_test.go +++ b/internal/testing/e2e_test.go @@ -12,12 +12,19 @@ governing permissions and limitations under the License. package e2e import ( + "context" + "fmt" "github.com/adobe/k8s-shredder/pkg/config" "github.com/adobe/k8s-shredder/pkg/handler" "github.com/adobe/k8s-shredder/pkg/utils" + "github.com/prometheus/client_golang/api" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" log "github.com/sirupsen/logrus" "golang.org/x/exp/slices" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "os" + "strings" "testing" "time" ) @@ -91,6 +98,51 @@ func compareTime(expirationTime time.Time, t *testing.T, ch chan time.Time) { } func TestShredderMetrics(t *testing.T) { - // TODO add metrics validation tests - t.Log("Metrics validation test passed!") + + var warnings []string + var results []string + + // Intentionally skipped the gauge metrics as they are going to be wiped out before every eviction loop + shredderMetrics := []string{ + "shredder_loops_total", + "shredder_loops_duration_seconds", + "shredder_processed_nodes_total", + "shredder_processed_pods_total", + "shredder_errors_total", + } + + for _, shredderMetric := range shredderMetrics { + result, warning, err := prometheusQuery(shredderMetric) + if err != nil { + t.Errorf("Error querying Prometheus: %v\n", err) + } + warnings = append(warnings, warning...) + results = append(results, result.String()) + } + + if len(warnings) > 0 { + t.Logf("Warnings: %v\n", strings.Join(warnings, "\n")) + } + + t.Logf("Results: \n%v\n", strings.Join(results, "\n")) + + if len(results) == len(shredderMetrics) { + t.Log("Metrics validation test passed!") + } +} + +func prometheusQuery(query string) (model.Value, v1.Warnings, error) { + + client, err := api.NewClient(api.Config{ + Address: "http://localhost:30007", + }) + if err != nil { + fmt.Printf("Error creating client: %v\n", err) + os.Exit(1) + } + + v1api := v1.NewAPI(client) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + return v1api.Query(ctx, query, time.Now(), v1.WithTimeout(5*time.Second)) } diff --git a/internal/testing/kind.yaml b/internal/testing/kind.yaml index 4d41bb4..fb7bbdc 100644 --- a/internal/testing/kind.yaml +++ b/internal/testing/kind.yaml @@ -5,6 +5,9 @@ networking: apiServerAddress: 0.0.0.0 nodes: - role: control-plane + extraPortMappings: + - containerPort: 30007 + hostPort: 30007 kubeadmConfigPatches: - | kind: InitConfiguration diff --git a/internal/testing/prometheus_stuffs.yaml b/internal/testing/prometheus_stuffs.yaml index c13c379..52224df 100644 --- a/internal/testing/prometheus_stuffs.yaml +++ b/internal/testing/prometheus_stuffs.yaml @@ -52,11 +52,13 @@ metadata: name: prometheus namespace: kube-system spec: + type: NodePort selector: app: prometheus ports: - port: 9090 targetPort: 9090 + nodePort: 30007 --- apiVersion: v1 kind: ConfigMap diff --git a/pkg/handler/handler.go b/pkg/handler/handler.go index e9fee78..27d4206 100644 --- a/pkg/handler/handler.go +++ b/pkg/handler/handler.go @@ -26,7 +26,7 @@ import ( log "github.com/sirupsen/logrus" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" - policy "k8s.io/api/policy/v1beta1" + policy "k8s.io/api/policy/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" @@ -69,9 +69,16 @@ func NewHandler(appContext *utils.AppContext) *Handler { // Run starts an eviction loop func (h *Handler) Run() error { + // start measuring the loop duration loopTimer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) { metrics.ShredderLoopsDurationSeconds.Observe(v * 10e6) })) + + // reset gauge metrics + metrics.ShredderNodeForceToEvictTime.Reset() + metrics.ShredderPodForceToEvictTime.Reset() + metrics.ShredderPodErrorsTotal.Reset() + h.logger.Infof("Starting eviction loop") // sync all nodes goroutines @@ -208,7 +215,14 @@ func (h *Handler) processNode(node v1.Node, rr chan *controllerObject) error { h.logger.WithFields(log.Fields{ "namespace": pod.Namespace, "pod": pod.Name, - }).Warnf("Failed to get pod controller object: %s", err.Error()) + }).Warnf("Failed to get pod controller object: %s. Proceeding directly with pod eviction", err.Error()) + err := h.evictPod(pod, deleteOptions) + if err != nil { + h.logger.WithFields(log.Fields{ + "namespace": pod.Namespace, + "pod": pod.Name, + }).Warnf("Failed to evict pod: %s", err.Error()) + } continue } @@ -296,8 +310,7 @@ func (h *Handler) GetPodsForNode(node v1.Node) ([]v1.Pod, error) { // evictPod evict a pod using the eviction API func (h *Handler) evictPod(pod v1.Pod, deleteOptions *metav1.DeleteOptions) error { h.logger.Infof("Evicting pod %s from %s namespace", pod.Name, pod.Namespace) - // TODO switch to stable V1 API version once we switch to k8s 1.22 - err := h.appContext.K8sClient.PolicyV1beta1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{ + err := h.appContext.K8sClient.PolicyV1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{ ObjectMeta: metav1.ObjectMeta{ Name: pod.Name, Namespace: pod.Namespace, @@ -367,9 +380,9 @@ func (h *Handler) getControllerObject(pod v1.Pod) (*controllerObject, error) { return co, err } return newControllerObject("StatefulSet", sts.Name, sts.Namespace, sts), nil + default: + return co, errors.Errorf("Controller object of type %s is not a standard controller", pod.OwnerReferences[0].Kind) } - - return co, errors.Errorf("could not find controller object for type %s\n", pod.OwnerReferences[0].Kind) } func (h *Handler) isRolloutRestartInProgress(co *controllerObject) (bool, error) { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index d5f21bd..5300f6d 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -78,10 +78,10 @@ var ( ) // ShredderPodErrorsTotal = Total pod errors - ShredderPodErrorsTotal = prometheus.NewCounterVec( - prometheus.CounterOpts{ + ShredderPodErrorsTotal = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ Name: "shredder_pod_errors_total", - Help: "Total pod errors", + Help: "Total pod errors per eviction loop", }, []string{"pod_name", "namespace", "reason", "action"}, )