Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix metrics and handle pods baked by custom controllers #12

Merged
merged 1 commit into from
Dec 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/go-co-op/gocron v1.16.2
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.13.0
github.com/prometheus/common v0.37.0
github.com/sirupsen/logrus v1.9.0
github.com/spf13/cobra v1.5.0
github.com/spf13/viper v1.12.0
Expand Down Expand Up @@ -53,7 +54,6 @@ require (
github.com/pelletier/go-toml v1.9.5 // indirect
github.com/pelletier/go-toml/v2 v2.0.1 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/spf13/afero v1.8.2 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NH
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
Expand Down Expand Up @@ -237,6 +238,7 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
Expand Down
56 changes: 54 additions & 2 deletions internal/testing/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,19 @@ governing permissions and limitations under the License.
package e2e

import (
"context"
"fmt"
"github.com/adobe/k8s-shredder/pkg/config"
"github.com/adobe/k8s-shredder/pkg/handler"
"github.com/adobe/k8s-shredder/pkg/utils"
"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
log "github.com/sirupsen/logrus"
"golang.org/x/exp/slices"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"os"
"strings"
"testing"
"time"
)
Expand Down Expand Up @@ -91,6 +98,51 @@ func compareTime(expirationTime time.Time, t *testing.T, ch chan time.Time) {
}

func TestShredderMetrics(t *testing.T) {
// TODO add metrics validation tests
t.Log("Metrics validation test passed!")

var warnings []string
var results []string

// Intentionally skipped the gauge metrics as they are going to be wiped out before every eviction loop
shredderMetrics := []string{
"shredder_loops_total",
"shredder_loops_duration_seconds",
"shredder_processed_nodes_total",
"shredder_processed_pods_total",
"shredder_errors_total",
}

for _, shredderMetric := range shredderMetrics {
result, warning, err := prometheusQuery(shredderMetric)
if err != nil {
t.Errorf("Error querying Prometheus: %v\n", err)
}
warnings = append(warnings, warning...)
results = append(results, result.String())
}

if len(warnings) > 0 {
t.Logf("Warnings: %v\n", strings.Join(warnings, "\n"))
}

t.Logf("Results: \n%v\n", strings.Join(results, "\n"))

if len(results) == len(shredderMetrics) {
t.Log("Metrics validation test passed!")
}
}

func prometheusQuery(query string) (model.Value, v1.Warnings, error) {

client, err := api.NewClient(api.Config{
Address: "http://localhost:30007",
})
if err != nil {
fmt.Printf("Error creating client: %v\n", err)
os.Exit(1)
}

v1api := v1.NewAPI(client)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
return v1api.Query(ctx, query, time.Now(), v1.WithTimeout(5*time.Second))
}
3 changes: 3 additions & 0 deletions internal/testing/kind.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ networking:
apiServerAddress: 0.0.0.0
nodes:
- role: control-plane
extraPortMappings:
- containerPort: 30007
hostPort: 30007
kubeadmConfigPatches:
- |
kind: InitConfiguration
Expand Down
2 changes: 2 additions & 0 deletions internal/testing/prometheus_stuffs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,13 @@ metadata:
name: prometheus
namespace: kube-system
spec:
type: NodePort
selector:
app: prometheus
ports:
- port: 9090
targetPort: 9090
nodePort: 30007
---
apiVersion: v1
kind: ConfigMap
Expand Down
25 changes: 19 additions & 6 deletions pkg/handler/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
log "github.com/sirupsen/logrus"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1beta1"
policy "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -69,9 +69,16 @@ func NewHandler(appContext *utils.AppContext) *Handler {

// Run starts an eviction loop
func (h *Handler) Run() error {
// start measuring the loop duration
loopTimer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) {
metrics.ShredderLoopsDurationSeconds.Observe(v * 10e6)
}))

// reset gauge metrics
metrics.ShredderNodeForceToEvictTime.Reset()
metrics.ShredderPodForceToEvictTime.Reset()
metrics.ShredderPodErrorsTotal.Reset()

h.logger.Infof("Starting eviction loop")

// sync all nodes goroutines
Expand Down Expand Up @@ -208,7 +215,14 @@ func (h *Handler) processNode(node v1.Node, rr chan *controllerObject) error {
h.logger.WithFields(log.Fields{
"namespace": pod.Namespace,
"pod": pod.Name,
}).Warnf("Failed to get pod controller object: %s", err.Error())
}).Warnf("Failed to get pod controller object: %s. Proceeding directly with pod eviction", err.Error())
err := h.evictPod(pod, deleteOptions)
if err != nil {
h.logger.WithFields(log.Fields{
"namespace": pod.Namespace,
"pod": pod.Name,
}).Warnf("Failed to evict pod: %s", err.Error())
}
continue
}

Expand Down Expand Up @@ -296,8 +310,7 @@ func (h *Handler) GetPodsForNode(node v1.Node) ([]v1.Pod, error) {
// evictPod evict a pod using the eviction API
func (h *Handler) evictPod(pod v1.Pod, deleteOptions *metav1.DeleteOptions) error {
h.logger.Infof("Evicting pod %s from %s namespace", pod.Name, pod.Namespace)
// TODO switch to stable V1 API version once we switch to k8s 1.22
err := h.appContext.K8sClient.PolicyV1beta1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{
err := h.appContext.K8sClient.PolicyV1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
Expand Down Expand Up @@ -367,9 +380,9 @@ func (h *Handler) getControllerObject(pod v1.Pod) (*controllerObject, error) {
return co, err
}
return newControllerObject("StatefulSet", sts.Name, sts.Namespace, sts), nil
default:
return co, errors.Errorf("Controller object of type %s is not a standard controller", pod.OwnerReferences[0].Kind)
}

return co, errors.Errorf("could not find controller object for type %s\n", pod.OwnerReferences[0].Kind)
}

func (h *Handler) isRolloutRestartInProgress(co *controllerObject) (bool, error) {
Expand Down
6 changes: 3 additions & 3 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ var (
)

// ShredderPodErrorsTotal = Total pod errors
ShredderPodErrorsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
ShredderPodErrorsTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "shredder_pod_errors_total",
Help: "Total pod errors",
Help: "Total pod errors per eviction loop",
},
[]string{"pod_name", "namespace", "reason", "action"},
)
Expand Down