Workers self monitoring (metrics and alert rules) (#50)
* sync alert rules with mimir repo:
https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml

* sync rules with mimir repo:
https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/rules.yaml

* fix typo

* add nginx-prometheus-exporter

* convert int ports to str

* move mimir rules to a specific directory

* add nginx rules and a new relation for nginx metrics

* add nginx-prometheus-exporter-image to itest

* fixing test_tls

* add scheme and avoid ssl verification in nginx-prometheus-exporter

* move pebble layer setup to workload's managers

* add explicit nginx prometheus exporter port

* improve tls handling

* make tls handling more robust

* update libs

* cos-tool render rules

* refactor alert rules rendering

* update prometheus scrape lib

* merge both metrics_endpoint relations into one

* remove leftover breakpoint()

* remove leftover cos_tool

* change relation name

* add missing nginx pebble ready configuration

* refactor/remove _ensure_consolidated_rules_dir

* add cos-tool-* to .gitignore

* add set_can_connect

* add itest to check scrape jobs and rules

* linting

* fix test_charm.py itest

* update loki_push_api lib

* add some debug logging

* add can_connect guard

* modify some logging

* add logging and suppress progress bar in curl

* add missing -sS to curl command
Abuelodelanada committed Apr 19, 2024
1 parent 8c194d5 commit 937f48d
Showing 10 changed files with 228 additions and 30 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@ __pycache__/
 .idea
 .vscode/
 *.egg-info/
+cos-tool-*
9 changes: 9 additions & 0 deletions charmcraft.yaml
@@ -18,3 +18,12 @@ parts:
       # For v2.tls_certificates
       - cryptography
       - jsonschema
+
+  cos-tool:
+    plugin: dump
+    source: .
+    build-packages:
+      - curl
+    override-pull: |
+      curl -L -O https://github.com/canonical/cos-tool/releases/latest/download/cos-tool-${CRAFT_TARGET_ARCH}
+      chmod +x cos-tool-*
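This part stages a `cos-tool-<arch>` binary in the charm root at build time (which is why the `.gitignore` hunk above excludes it). A charm can then shell out to it, for example to scope a PromQL expression to one Juju application. A minimal sketch — the `transform`/`--label-matcher` invocation follows canonical/cos-tool's CLI, but treat the exact flags and the arch mapping as assumptions, not code from this PR:

    import platform
    import subprocess
    from pathlib import Path


    def _cos_tool_path() -> Path:
        # The charmcraft part above drops cos-tool-${CRAFT_TARGET_ARCH} into the charm root.
        arch = {"x86_64": "amd64", "aarch64": "arm64"}.get(platform.machine(), "amd64")
        return Path(f"cos-tool-{arch}")


    def inject_label_matchers(expression: str, app: str, model: str) -> str:
        """Rewrite a PromQL expression so it only matches one Juju app/model."""
        result = subprocess.run(
            [
                str(_cos_tool_path()),
                "transform",
                f"--label-matcher=juju_application={app}",
                f"--label-matcher=juju_model={model}",
                expression,
            ],
            check=True,
            capture_output=True,
            text=True,
        )
        return result.stdout.strip()

Note that later commits in this PR ("refactor alert rules rendering", "remove leftover cos_tool") moved the actual rule rendering to cosl's `AlertRules`, so this sketch only illustrates what the bundled binary is for.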
4 changes: 3 additions & 1 deletion lib/charms/loki_k8s/v1/loki_push_api.py
@@ -519,7 +519,9 @@ def _alert_rules_error(self, event):
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 7
+LIBPATCH = 8
 
+PYDEPS = ["cosl"]
+
 logger = logging.getLogger(__name__)
9 changes: 2 additions & 7 deletions metadata.yaml
@@ -81,12 +81,7 @@ provides:
   grafana-source:
     interface: grafana_datasource
 
-  metrics-endpoint:
+  self-metrics-endpoint:
     interface: prometheus_scrape
     description: |
-      The coordinator provides scrape jobs for itself.
-
-  workers-metrics-endpoint:
-    interface: prometheus_scrape
-    description: |
-      The coordinator provides scrape jobs for all the workers related to it.
+      The coordinator provides scrape jobs for itself and for the workers.
90 changes: 74 additions & 16 deletions src/charm.py
@@ -9,14 +9,18 @@
https://discourse.charmhub.io/t/4208
"""
import glob
import json
import logging
import os
import shutil
import socket
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional

import ops
import yaml
from charms.data_platform_libs.v0.s3 import (
    S3Requirer,
)
@@ -29,6 +33,8 @@
 from charms.tempo_k8s.v1.charm_tracing import trace_charm
 from charms.tempo_k8s.v1.tracing import TracingEndpointRequirer
 from charms.traefik_k8s.v2.ingress import IngressPerAppReadyEvent, IngressPerAppRequirer
+from cosl import JujuTopology
+from cosl.rules import AlertRules
 from mimir_cluster import MimirClusterProvider
 from mimir_config import BUCKET_NAME, S3_RELATION_NAME, _S3ConfigData
 from mimir_coordinator import MimirCoordinator
@@ -41,6 +47,10 @@
 # Log messages can be retrieved using juju debug-log
 logger = logging.getLogger(__name__)
 
+NGINX_ORIGINAL_ALERT_RULES_PATH = "./src/prometheus_alert_rules/nginx"
+WORKER_ORIGINAL_ALERT_RULES_PATH = "./src/prometheus_alert_rules/mimir_workers"
+CONSOLIDATED_ALERT_RULES_PATH = "./src/prometheus_alert_rules/consolidated_rules"
+

@trace_charm(
    tracing_endpoint="tempo_endpoint",
@@ -61,7 +71,6 @@ def __init__(self, *args: Any):
         # TODO: On any worker relation-joined/departed, need to update grafana agent's scrape
         # targets with the new memberlist.
         # (Remote write would still be the same nginx-proxied endpoint.)
-
         self._nginx_container = self.unit.get_container("nginx")
         self._nginx_prometheus_exporter_container = self.unit.get_container(
             "nginx-prometheus-exporter"
@@ -101,17 +110,19 @@ def __init__(self, *args: Any):
             secure_extra_fields={"httpHeaderValue1": "anonymous"},
         )
         self.loki_consumer = LokiPushApiConsumer(self, relation_name="logging-consumer")
-        self.worker_metrics_endpoints = MetricsEndpointProvider(
-            self,
-            relation_name="workers-metrics-endpoint",
-            alert_rules_path="./src/prometheus_alert_rules/mimir_workers",
-            jobs=self.workers_scrape_jobs,
-        )
-        self.nginx_metrics_endpoints = MetricsEndpointProvider(
+
+        self._consolidate_nginx_rules()
+        self.metrics_endpoints = MetricsEndpointProvider(
             self,
-            relation_name="metrics-endpoint",
-            alert_rules_path="./src/prometheus_alert_rules/nginx",
-            jobs=self.nginx_scrape_jobs,
+            relation_name="self-metrics-endpoint",
+            alert_rules_path=CONSOLIDATED_ALERT_RULES_PATH,
+            jobs=self._scrape_jobs,
+            refresh_event=[
+                self.on.mimir_cluster_relation_joined,
+                self.on.mimir_cluster_relation_changed,
+                self.on.mimir_cluster_relation_departed,
+                self.on.mimir_cluster_relation_broken,
+            ],
         )
         self.ingress = IngressPerAppRequirer(charm=self, strip_prefix=True)

@@ -128,7 +139,7 @@ def __init__(self, *args: Any):
         self.framework.observe(self.server_cert.on.cert_changed, self._on_server_cert_changed)
         # Mimir Cluster
         self.framework.observe(
-            self.on.mimir_cluster_relation_joined, self._on_mimir_cluster_changed
+            self.on.mimir_cluster_relation_joined, self._on_mimir_cluster_joined
         )
         self.framework.observe(
             self.on.mimir_cluster_relation_changed, self._on_mimir_cluster_changed
@@ -158,19 +169,28 @@ def __init__(self, *args: Any):

    def _on_config_changed(self, _: ops.ConfigChangedEvent):
        """Handle changed configuration."""
        self.nginx.configure_pebble_layer(tls=self._is_tls_ready)
        self._render_workers_alert_rules()
        self._update_mimir_cluster()

    def _on_server_cert_changed(self, _):
        self._update_cert()
        self.nginx.configure_pebble_layer(tls=self._is_tls_ready)
        self._update_mimir_cluster()

    def _on_mimir_cluster_joined(self, _):
        self.nginx.configure_pebble_layer(tls=self._is_tls_ready)
        self._render_workers_alert_rules()
        self._update_mimir_cluster()

    def _on_mimir_cluster_changed(self, _):
        self.nginx.configure_pebble_layer(tls=self._is_tls_ready)
        self._render_workers_alert_rules()
        self._update_mimir_cluster()

    def _on_mimir_cluster_departed(self, _):
        self.nginx.configure_pebble_layer(tls=self._is_tls_ready)
        self._render_workers_alert_rules()
        self._update_mimir_cluster()

    def _on_s3_changed(self, _):
@@ -256,10 +276,10 @@ def mimir_worker_relations(self) -> List[ops.Relation]:
         return self.model.relations.get("mimir_worker", [])
 
     @property
-    def workers_scrape_jobs(self) -> List[Dict[str, Any]]:
-        """Scrape jobs for the Mimir workers."""
+    def _workers_scrape_jobs(self) -> List[Dict[str, Any]]:
         scrape_jobs = []
         worker_topologies = self.cluster_provider.gather_topology()
+
         for worker in worker_topologies:
             job = {
                 "static_configs": [
@@ -282,13 +302,16 @@ def workers_scrape_jobs(self) -> List[Dict[str, Any]]:
         return scrape_jobs
 
     @property
-    def nginx_scrape_jobs(self) -> List[Dict[str, Any]]:
-        """Scrape jobs for the Mimir Coordinator."""
+    def _nginx_scrape_jobs(self) -> List[Dict[str, Any]]:
         job: Dict[str, Any] = {
             "static_configs": [{"targets": [f"{self.hostname}:{NGINX_PROMETHEUS_EXPORTER_PORT}"]}]
         }
         return [job]
 
+    @property
+    def _scrape_jobs(self) -> List[Dict[str, Any]]:
+        return self._workers_scrape_jobs + self._nginx_scrape_jobs
+
     @property
     def loki_endpoints_by_unit(self) -> Dict[str, str]:
         """Loki endpoints from relation data in the format needed for Pebble log forwarding.
@@ -347,6 +370,41 @@ def external_url(self) -> str:
    # === UTILITY METHODS === #
    ###########################

    def _render_workers_alert_rules(self):
        self._remove_rendered_rules()

        apps = set()
        for worker in self.cluster_provider.gather_topology():
            if worker["app"] in apps:
                continue

            apps.add(worker["app"])
            topology_dict = {
                "model": self.model.name,
                "model_uuid": self.model.uuid,
                "application": worker["app"],
                "unit": worker["unit"],
                "charm_name": "mimir-worker-k8s",
            }
            topology = JujuTopology.from_dict(topology_dict)
            alert_rules = AlertRules(query_type="promql", topology=topology)
            alert_rules.add_path(WORKER_ORIGINAL_ALERT_RULES_PATH, recursive=True)
            alert_rules_contents = yaml.dump(alert_rules.as_dict())

            file_name = f"{CONSOLIDATED_ALERT_RULES_PATH}/rendered_{worker['app']}.rules"
            with open(file_name, "w") as writer:
                writer.write(alert_rules_contents)

    def _remove_rendered_rules(self):
        files = glob.glob(f"{CONSOLIDATED_ALERT_RULES_PATH}/rendered_*")
        for f in files:
            os.remove(f)

    def _consolidate_nginx_rules(self):
        os.makedirs(CONSOLIDATED_ALERT_RULES_PATH, exist_ok=True)
        for filename in glob.glob(os.path.join(NGINX_ORIGINAL_ALERT_RULES_PATH, "*.*")):
            shutil.copy(filename, f"{CONSOLIDATED_ALERT_RULES_PATH}/")
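Net effect of the three helpers above: `_consolidate_nginx_rules` copies the static nginx rules once at charm init, while `_render_workers_alert_rules`/`_remove_rendered_rules` regenerate one file per related worker application on every cluster event. With the `read`/`write` applications from the integration test below, the consolidated directory would look roughly like this (the nginx file name is illustrative; the `rendered_*` names come from the code above):

    src/prometheus_alert_rules/consolidated_rules/
    ├── nginx.rules           # copied verbatim from prometheus_alert_rules/nginx/
    ├── rendered_read.rules   # worker rules templated with "read"'s Juju topology
    └── rendered_write.rules  # worker rules templated with "write"'s Juju topology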

    def _update_mimir_cluster(self):  # common exit hook
        """Build the config and publish everything to the application databag."""
        if not self.coordinator.is_coherent():
11 changes: 6 additions & 5 deletions src/nginx.py
@@ -195,11 +195,12 @@ def __init__(self, charm: CharmBase, cluster_provider: MimirClusterProvider, ser

     def configure_pebble_layer(self, tls: bool) -> None:
         """Configure pebble layer."""
-        self._container.push(
-            self.config_path, self.config(tls=tls), make_dirs=True  # type: ignore
-        )
-        self._container.add_layer("nginx", self.layer, combine=True)
-        self._container.autostart()
+        if self._container.can_connect():
+            self._container.push(
+                self.config_path, self.config(tls=tls), make_dirs=True  # type: ignore
+            )
+            self._container.add_layer("nginx", self.layer, combine=True)
+            self._container.autostart()
 
     def config(self, tls: bool = False) -> str:
         """Build and return the Nginx configuration."""
11 changes: 11 additions & 0 deletions tests/integration/helpers.py
@@ -79,3 +79,14 @@ def get_workload_file(
         logger.error(e.stdout.decode())
         raise e
     return res.stdout
+
+
+async def run_command(model_name: str, app_name: str, unit_num: int, command: list) -> bytes:
+    cmd = ["juju", "ssh", "--model", model_name, f"{app_name}/{unit_num}", *command]
+    try:
+        res = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        logger.info(res)
+    except subprocess.CalledProcessError as e:
+        logger.error(e.stdout.decode())
+        raise e
+    return res.stdout
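`test_self_monitoring.py` below uses this helper to hit the Prometheus HTTP API from inside the `prom/0` unit, for example:

    # Query Prometheus' targets endpoint from within the prom/0 unit:
    cmd = ["curl", "-sS", "http://localhost:9090/api/v1/targets"]
    result = await run_command(ops_test.model_name, "prom", 0, command=cmd)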
2 changes: 1 addition & 1 deletion tests/integration/test_charm.py
@@ -55,7 +55,7 @@ async def test_build_and_deploy(ops_test: OpsTest):
         relations:
         - [mc:logging-consumer, loki:logging]
-        - [mc:metrics-endpoint, prometheus:metrics-endpoint]
+        - [mc:self-metrics-endpoint, prometheus:metrics-endpoint]
         - [mc:grafana-dashboards-provider, grafana:grafana-dashboard]
         """
     )
120 changes: 120 additions & 0 deletions tests/integration/test_self_monitoring.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
# Copyright 2024 Ubuntu
# See LICENSE file for licensing details.

import json
import logging
from pathlib import Path
from textwrap import dedent
from types import SimpleNamespace

import pytest
import yaml
from helpers import deploy_literal_bundle, run_command
from pytest_operator.plugin import OpsTest

logger = logging.getLogger(__name__)

METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
coord = SimpleNamespace(name="coord")
apps = ["coord", "write", "read", "prom"]


@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest):
    """Build the charm-under-test and deploy it together with related charms."""
    charm = await ops_test.build_charm(".")

    test_bundle = dedent(
        f"""
        ---
        bundle: kubernetes
        name: test-charm
        applications:
          {coord.name}:
            charm: {charm}
            trust: true
            resources:
              nginx-image: {METADATA["resources"]["nginx-image"]["upstream-source"]}
              nginx-prometheus-exporter-image: {METADATA["resources"]["nginx-prometheus-exporter-image"]["upstream-source"]}
            scale: 1
          prom:
            charm: prometheus-k8s
            channel: edge
            scale: 1
            trust: true
          read:
            charm: mimir-worker-k8s
            channel: edge
            scale: 1
            constraints: arch=amd64
            options:
              alertmanager: true
              compactor: true
              querier: true
              query-frontend: true
              query-scheduler: true
              ruler: true
              store-gateway: true
            trust: true
          write:
            charm: mimir-worker-k8s
            channel: edge
            scale: 1
            constraints: arch=amd64
            options:
              compactor: true
              distributor: true
              ingester: true
            trust: true
        relations:
        - - prom:metrics-endpoint
          - coord:self-metrics-endpoint
        - - coord:mimir-cluster
          - read:mimir-cluster
        - - coord:mimir-cluster
          - write:mimir-cluster
        """
    )

    # Deploy the charm and wait for active/idle status
    await deploy_literal_bundle(ops_test, test_bundle)  # See appendix below
    await ops_test.model.wait_for_idle(
        apps=["read", "write", "prom"],
        status="active",
        raise_on_error=False,
        timeout=600,
        idle_period=30,
    )

    await ops_test.model.wait_for_idle(
        apps=[coord.name], status="blocked", raise_on_error=False, timeout=600, idle_period=30
    )


@pytest.mark.abort_on_fail
async def test_scrape_jobs(ops_test: OpsTest):
    # Check scrape jobs
    cmd = ["curl", "-sS", "http://localhost:9090/api/v1/targets"]
    result = await run_command(ops_test.model_name, "prom", 0, command=cmd)
    logger.info(result)
    result_json = json.loads(result.decode("utf-8"))

    active_targets = result_json["data"]["activeTargets"]

    for at in active_targets:
        assert at["labels"]["juju_application"] in apps


@pytest.mark.abort_on_fail
async def test_rules(ops_test: OpsTest):
    # Check Rules
    cmd = ["curl", "-sS", "http://localhost:9090/api/v1/rules"]
    result = await run_command(ops_test.model_name, "prom", 0, command=cmd)
    logger.info(result)
    result_json = json.loads(result.decode("utf-8"))
    groups = result_json["data"]["groups"]

    for group in groups:
        for rule in group["rules"]:
            assert rule["labels"]["juju_application"] in apps
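For reference, the assertions above walk payloads shaped like the following (per the Prometheus HTTP API; the values are illustrative, not captured from a real run, and the `juju_*` labels come from the topology injected into the scrape jobs and rendered rules):

    # Illustrative /api/v1/targets response shape:
    {
        "status": "success",
        "data": {
            "activeTargets": [
                {
                    "labels": {"instance": "10.1.0.7:8080", "juju_application": "write"},
                    "health": "up",
                }
            ]
        },
    }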
