Skip to content

Commit

Permalink
feat: add initial prometheus metrics support
Browse files Browse the repository at this point in the history
---
Metrics for the prometheus endpoint are slightly different from the json one.
- I do not want to break things for those that use the old endpoint.
- Some metrics are hard to utilize in Prometheus without a format
  change. (example: certificate timestamp)
  - Usually blackbox_exporter would cover certificate monitoring, but
    since people use TRMM with multiple proxies both internal monitoring
    and blackbox_exporter may be useful.
- Memory and disk space metrics were not transferred because they are system-wide
  and Prometheus users likely will have node_exporter installed for
  that.
- Prometheus format has slightly more detailed metrics.

---
Instructions:
1. Set up the MON_TOKEN variable in
   `/rmm/api/tacticalrmm/tacticalrmm/local_settings.py` for your bearer_token.
   This is the same as for the json endpoint. See [Tips and Tricks](https://docs.tacticalrmm.com/tipsntricks/#monitor-your-trmm-instance-via-the-built-in-monitoring-endpoint).
2. Test with curl command:
   `curl -s -H "Authorization: Bearer $MON_TOKEN" https://api.trmm.example.com/core/status/`
3. Set up a Prometheus job with:
```
- job_name: trmm
  scrape_interval: 60s
  metrics_path: /core/status/
  scheme: https
  bearer_token: $MON_TOKEN
  static_configs:
  - targets:
    - api.trmm.example.com
```
  • Loading branch information
ykuksenko committed Apr 1, 2023
1 parent b28316a commit 2e02f78
Show file tree
Hide file tree
Showing 3 changed files with 299 additions and 48 deletions.
38 changes: 25 additions & 13 deletions api/tacticalrmm/core/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,35 @@

def monitoring_view(function):
def wrap(request, *args, **kwargs):
if request.method != "POST":
return HttpResponse("Invalid request type\n", status=400)
if request.method == "POST":
try:
data = json.loads(request.body)
except:
return HttpResponse("Invalid json\n", status=400)

if "auth" not in data.keys():
return HttpResponse("Invalid payload\n", status=400)

token = getattr(settings, "MON_TOKEN", "")
if not token:
return HttpResponse("Missing token\n", status=401)

try:
data = json.loads(request.body)
except:
return HttpResponse("Invalid json\n", status=400)
if data.get("auth") != token:
return HttpResponse("Not authenticated\n", status=401)

if "auth" not in data.keys():
return HttpResponse("Invalid payload\n", status=400)
elif request.method == "GET":
if "Authorization" not in request.headers:
return HttpResponse("Missing 'Authorization' header\n", status=400)

token = getattr(settings, "MON_TOKEN", "")
if not token:
return HttpResponse("Missing token\n", status=401)
token = getattr(settings, "MON_TOKEN", "")
if not token:
return HttpResponse("Missing token\n", status=401)

if data.get("auth") != token:
return HttpResponse("Not authenticated\n", status=401)
if request.headers["Authorization"] != "Bearer " + token:
return HttpResponse("Not authenticated\n", status=401)

else:
return HttpResponse("Invalid request type\n", status=400)

return function(request, *args, **kwargs)

Expand Down
129 changes: 129 additions & 0 deletions api/tacticalrmm/core/tests.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import patch

import tempfile
import requests
from channels.db import database_sync_to_async
from channels.testing import WebsocketCommunicator
Expand Down Expand Up @@ -500,3 +501,131 @@ def test_get_meshagent_url_docker(self):
r,
"http://tactical-meshcentral:4443/meshagents?id=4&meshid=abc123&installflags=0",
)


class TestMonitoring(TacticalTestCase):
    """Tests for the ``/core/status/`` monitoring endpoint.

    The endpoint is exercised in both of its modes:

    * ``GET`` with an ``Authorization: Bearer <token>`` header
      (Prometheus text format).
    * ``POST`` with a JSON body containing an ``auth`` key (JSON format).

    Requests using any other HTTP method are expected to be rejected.
    """

    url = "/core/status/"

    def setUp(self):
        from os import unlink

        self.setup_client()
        self.setup_coresettings()

        # Generate snakeoil cert with `make-ssl-cert generate-default-snakeoil` on ubuntu
        # Cert will be in '/etc/ssl/certs/ssl-cert-snakeoil.pem'.
        # Cert is used only for expiration date, so it can be self-signed, expired and no key is needed.
        with tempfile.NamedTemporaryFile(delete=False) as cert_file:
            # Register the removal before writing so the file never outlives
            # the test, even when the rest of setUp raises.
            self.addCleanup(unlink, cert_file.name)
            cert_file.write(
                """-----BEGIN CERTIFICATE-----
MIIC4jCCAcqgAwIBAgIUCFgTym78sGgRHwEmLyGgmr1JjSUwDQYJKoZIhvcNAQEL
BQAwFzEVMBMGA1UEAwwMZjNjMTQzOWM0NzZjMB4XDTIzMDMzMTA1MTgzOFoXDTMz
MDMyODA1MTgzOFowFzEVMBMGA1UEAwwMZjNjMTQzOWM0NzZjMIIBIjANBgkqhkiG
9w0BAQEFAAOCAQ8AMIIBCgKCAQEAzFWItB4aM/aUWIhk0SS1XKHLHao9/OwbGHet
lnrlZD2YM/DdUzqdYeYdujyLvWUj1xU+YcFv+vo3Mmu8HQVOKNcEZ5ZilHW/87X8
6ZjtUzPYmCapxXNTX8yh2EES582uq64j0t3OwfaCJmpJLwjvCnrizfUFe76iy5Ge
wVviYtkaIfHEwNoJLmFb07rYhNuV4tiwHUhmZqqm5nxpjKbTsI4YHnpSxNktU32C
vNVnIRIAHDZ8n8wCaKTPZMui9X/IJx1pA3EkbD2givbH/0nYRcd5ZUDxLsTJThob
8k5kPd1zVXqaH/ufqkekqoiY+kIWsgVd0iWx3qihhydAhRY5SQIDAQABoyYwJDAJ
BgNVHRMEAjAAMBcGA1UdEQQQMA6CDGYzYzE0MzljNDc2YzANBgkqhkiG9w0BAQsF
AAOCAQEAH91bAuK3tKf1v4D+t48SWSE2uFjCe6o2CzMwAdM3rVa47X2cw5nKOH5L
8nQJhJjq/t93DJi4WOpN579NWtTkwXyCl7srSvj8aK4FDKxKcWQNT1PUAa+gh8IB
WJdEK4lMSatCtA/wsq6jmkTwINZ/ELZp4BRU2gUp8mFU9fVQDMlY+2qwUzzIp97A
WISWVxML58FDFnQLsaP1SfapVWTTXTh4xnhr7VxklUadcGRnx9+Ig4Ieq27eSCiV
DC/aSRIyi9HaVZPTMbqLC50auHr/dQIL4pGyxFTD8OJoeRkQgAb1wWuAPhab20Xu
XyFzZMiRlyNNSPoYVExb65s1bawqew==
-----END CERTIFICATE-----""".encode(
                    encoding="utf-8"
                )
            )

        # Kept as the (now closed) file object: tests only read its ``.name``.
        self.snakeoil_certificate = cert_file

    # helpers
    def _cert_settings(self):
        """Return a settings override pointing CERT_FILE at the snakeoil cert."""
        return self.settings(
            CERT_FILE=self.snakeoil_certificate.name,
            KEY_FILE="/do/not/need/a/key/here",
        )

    def _get_with_bearer(self, token):
        """Issue a GET to the status url with ``Authorization: Bearer <token>``."""
        # HTTP_AUTHORIZATION is the canonical WSGI environ key for the header;
        # it is visible through both request.headers and request.META.
        return self.client.get(self.url, HTTP_AUTHORIZATION="Bearer " + token)

    def _post_json(self, payload):
        """Issue a JSON POST to the status url with ``payload`` as the body."""
        return self.client.post(self.url, data=payload, format="json")

    # prometheus tests
    def test_prometheus_missing_auth_header_request(self):
        r = self.client.get(self.url)
        self.assertEqual(r.status_code, 400)

    def test_prometheus_missing_token_config(self):
        # No MON_TOKEN override here: the request must be refused.
        r = self._get_with_bearer("MySuperTestSecret")
        self.assertEqual(r.status_code, 401)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_prometheus_incorrect_token_request(self):
        r = self._get_with_bearer("NotMySuperTestSecret")
        self.assertEqual(r.status_code, 401)

    @override_settings(DOCKER_BUILD=True, MON_TOKEN="MySuperTestSecret")
    def test_prometheus_correct_docker_build_request(self):
        with self._cert_settings():
            r = self._get_with_bearer("MySuperTestSecret")
        self.assertEqual(r.status_code, 200)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_prometheus_correct_request(self):
        with self._cert_settings():
            r = self._get_with_bearer("MySuperTestSecret")
        self.assertEqual(r.status_code, 200)

    # invalid tests
    def test_invalid_request(self):
        r = self.client.put(self.url)
        self.assertEqual(r.status_code, 400)
        self.assertEqual(
            r.content,
            b"Invalid request type\n",
        )

    # json tests
    def test_json_invalid_json_request(self):
        r = self.client.post(
            self.url,
            data="I am not json!",
            content_type="application/json",
        )
        self.assertEqual(r.status_code, 400)

    def test_json_invalid_payload_request(self):
        r = self._post_json({"notauth": "NotMySuperTestSecret"})
        self.assertEqual(r.status_code, 400)

    def test_json_missing_token_request(self):
        # No MON_TOKEN override here: the request must be refused.
        r = self._post_json({"auth": "MySuperTestSecret"})
        self.assertEqual(r.status_code, 401)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_json_incorrect_token_request(self):
        r = self._post_json({"auth": "NotMySuperTestSecret"})
        self.assertEqual(r.status_code, 401)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_json_correct_request(self):
        with self._cert_settings():
            r = self._post_json({"auth": "MySuperTestSecret"})
        self.assertEqual(r.status_code, 200)

    @override_settings(DOCKER_BUILD=True, MON_TOKEN="MySuperTestSecret")
    def test_json_correct_docker_build_request(self):
        with self._cert_settings():
            r = self._post_json({"auth": "MySuperTestSecret"})
        self.assertEqual(r.status_code, 200)
180 changes: 145 additions & 35 deletions api/tacticalrmm/core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytz
from cryptography import x509
from django.conf import settings
from django.http import JsonResponse
from django.http import JsonResponse, HttpResponse
from django.shortcuts import get_object_or_404
from django.utils import timezone as djangotime
from django.views.decorators.csrf import csrf_exempt
Expand Down Expand Up @@ -409,43 +409,153 @@ def status(request):
from agents.models import Agent
from clients.models import Client, Site

disk_usage: int = round(psutil.disk_usage("/").percent)
mem_usage: int = round(psutil.virtual_memory().percent)

# common metrics bits
cert_file, _ = get_certs()
cert_bytes = Path(cert_file).read_bytes()

cert = x509.load_pem_x509_certificate(cert_bytes)
expires = pytz.utc.localize(cert.not_valid_after)
now = djangotime.now()
delta = expires - now

ret = {
"version": settings.TRMM_VERSION,
"latest_agent_version": settings.LATEST_AGENT_VER,
"agent_count": Agent.objects.count(),
"client_count": Client.objects.count(),
"site_count": Site.objects.count(),
"disk_usage_percent": disk_usage,
"mem_usage_percent": mem_usage,
"days_until_cert_expires": delta.days,
"cert_expired": delta.days < 0,

# common services
services = {
"django": "rmm.service",
"mesh": "meshcentral.service",
"daphne": "daphne.service",
"celery": "celery.service",
"celerybeat": "celerybeat.service",
"redis": "redis-server.service",
"postgres": "postgresql.service",
"mongo": "mongod.service",
"nats": "nats.service",
"nats-api": "nats-api.service",
"nginx": "nginx.service",
}

if settings.DOCKER_BUILD:
ret["services_running"] = "not available in docker"
else:
ret["services_running"] = {
"django": sysd_svc_is_running("rmm.service"),
"mesh": sysd_svc_is_running("meshcentral.service"),
"daphne": sysd_svc_is_running("daphne.service"),
"celery": sysd_svc_is_running("celery.service"),
"celerybeat": sysd_svc_is_running("celerybeat.service"),
"redis": sysd_svc_is_running("redis-server.service"),
"postgres": sysd_svc_is_running("postgresql.service"),
"mongo": sysd_svc_is_running("mongod.service"),
"nats": sysd_svc_is_running("nats.service"),
"nats-api": sysd_svc_is_running("nats-api.service"),
"nginx": sysd_svc_is_running("nginx.service"),
# TRMM json monitoring
if request.method == "POST":
disk_usage: int = round(psutil.disk_usage("/").percent)
mem_usage: int = round(psutil.virtual_memory().percent)

cert_expires = pytz.utc.localize(cert.not_valid_after)
now = djangotime.now()
delta = cert_expires - now

ret = {
"version": settings.TRMM_VERSION,
"latest_agent_version": settings.LATEST_AGENT_VER,
"agent_count": Agent.objects.count(),
"client_count": Client.objects.count(),
"site_count": Site.objects.count(),
"disk_usage_percent": disk_usage,
"mem_usage_percent": mem_usage,
"days_until_cert_expires": delta.days,
"cert_expired": delta.days < 0,
}
return JsonResponse(ret, json_dumps_params={"indent": 2})

if settings.DOCKER_BUILD:
ret["services_running"] = "not available in docker"
else:
ret["services_running"] = {}
for k, v in services.items():
ret["services_running"][k] = sysd_svc_is_running(v)
return JsonResponse(ret, json_dumps_params={"indent": 2})

# TRMM Prometheus monitoring
elif request.method == "GET":
# get agent counts
from clients.serializers import ClientSerializer
from django.db.models import Count, Prefetch

agent_counts = ClientSerializer(
Client.objects.order_by("name").prefetch_related(
Prefetch(
"sites",
queryset=Site.objects.order_by("name")
.select_related("client")
.annotate(agent_count=Count("agents")),
to_attr="filtered_sites",
)
),
many=True,
).data

# generate agent count metrics
agent_count_metrics = []
for client in agent_counts:
for site in client["sites"]:
agent_count_metrics.append(
(
{"client": client["name"], "site": site["name"]},
site["agent_count"],
)
)

# create base prometheus metric dataset
metrics = {
"trmm_buildinfo": {
"type": "gauge",
"help": "trmm version",
"entries": [({"version": settings.TRMM_VERSION}, 1)],
},
"trmm_meshinfo": {
"type": "gauge",
"help": "meshcentral version",
"entries": [({"version": settings.MESH_VER}, 1)],
},
"trmm_natsinfo": {
"type": "gauge",
"help": "nats version",
"entries": [({"version": settings.NATS_SERVER_VER}, 1)],
},
"trmm_appinfo": {
"type": "gauge",
"help": "vue version",
"entries": [({"version": settings.APP_VER}, 1)],
},
"trmm_agentinfo": {
"type": "gauge",
"help": "latest version of trmm agent",
"entries": [({"version": settings.LATEST_AGENT_VER}, 1)],
},
"trmm_agents": {
"type": "gauge",
"help": "number of registered agents in trmm",
"entries": agent_count_metrics,
},
"trmm_cert_expiry": {
"type": "gauge",
"help": "unix timestamp of certificate expiration",
"entries": [({}, cert.not_valid_after.timestamp())],
},
}

# add service metrics if this is not a docker build
if not settings.DOCKER_BUILD:
e = []
for k, v in services.items():
e.append(({"name": v, "service": k}, int(sysd_svc_is_running(v))))

metrics["trmm_systemd_unit_state"] = {
"type": "gauge",
"help": "trmm service status for non docker builds",
"entries": e,
}

# render prometheus metrics
payload = ""
for metric, data in metrics.items():
# create help and type hints
if "help" in data:
payload += "# HELP {} {}\n".format(metric, data["help"])
payload += "# TYPE {} {}\n".format(metric, data["type"])
# populate the metrics
for labels, value in data["entries"]:
label_string = ",".join(
['{}="{}"'.format(i[0], i[1]) for i in labels.items()]
)
if label_string != "":
label_string = "{{{}}}".format(label_string)
payload += "{}{} {}\n".format(metric, label_string, value)
return HttpResponse(payload, content_type="text/plain")

# The monitoring_view decorator should prevent this state from ever occurring.
else:
return HttpResponse("It should not be possible to be here.\n", status=500)

0 comments on commit 2e02f78

Please sign in to comment.