Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pybind/mgr/prometheus: add StandbyModule and handle failed MON cluster #19744

Merged
merged 4 commits into from
Jan 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
65 changes: 65 additions & 0 deletions qa/tasks/mgr/test_prometheus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@


from mgr_test_case import MgrTestCase

import logging
import requests


log = logging.getLogger(__name__)


class TestPrometheus(MgrTestCase):
    """Smoke tests for the HTTP endpoints of the ``prometheus`` mgr module.

    ``test_standby`` fails the active mgr and checks that the module's URI
    moves while the old URI keeps answering; ``test_urls`` checks that the
    published URI answers with a 200 on its known paths.
    """

    # Number of mgr daemons the test cluster must provide.
    MGRS_REQUIRED = 3

    # Upper bound (seconds) for a single HTTP request.  Without a timeout
    # ``requests`` waits indefinitely for a peer that accepted the
    # connection but never replies, which would hang the whole test run.
    REQUEST_TIMEOUT = 30

    def _fetch(self, url):
        """GET *url* without following redirects, bounded by REQUEST_TIMEOUT."""
        return requests.get(url, allow_redirects=False,
                            timeout=self.REQUEST_TIMEOUT)

    def test_standby(self):
        """After a mgr failover the old URI must still answer with a 200."""
        self._assign_ports("prometheus", "server_port")
        self._load_module("prometheus")

        original_active = self.mgr_cluster.get_active_id()

        original_uri = self._get_uri("prometheus")
        log.info("Originally running at %s", original_uri)

        self.mgr_cluster.mgr_fail(original_active)

        failed_over_uri = self._get_uri("prometheus")
        log.info("After failover running at %s", failed_over_uri)

        self.assertNotEqual(original_uri, failed_over_uri)

        # The original active daemon should have come back up as a standby
        # and serve some html under "/" and an empty answer under /metrics
        r = self._fetch(original_uri)
        self.assertEqual(r.status_code, 200)
        r = self._fetch(original_uri + "metrics")
        self.assertEqual(r.status_code, 200)
        self.assertEqual(r.headers["content-type"], "text/plain;charset=utf-8")

    def test_urls(self):
        """Every known path of the module must answer with a 200."""
        self._assign_ports("prometheus", "server_port")
        self._load_module("prometheus")

        base_uri = self._get_uri("prometheus")

        # This is a very simple smoke test to check that the module can
        # give us a 200 response to requests.  We're not testing that
        # the content is correct or even renders!
        urls = [
            "/",
            "/metrics",
        ]

        # Collect every failing path instead of stopping at the first one,
        # so a single run reports all broken endpoints.
        failures = []

        for url in urls:
            r = self._fetch(base_uri + url)
            if r.status_code != 200:
                failures.append(url)

            log.info("%s: %s (%s bytes)", url, r.status_code, len(r.content))

        self.assertListEqual(failures, [])
81 changes: 70 additions & 11 deletions src/pybind/mgr/prometheus/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import errno
import math
import os
import socket
from collections import OrderedDict
from mgr_module import MgrModule
from mgr_module import MgrModule, MgrStandbyModule

# Defaults for the Prometheus HTTP server. Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
Expand Down Expand Up @@ -137,8 +138,6 @@ class Module(MgrModule):

def __init__(self, *args, **kwargs):
super(Module, self).__init__(*args, **kwargs)
self.notified = False
self.serving = False
self.metrics = self._setup_static_metrics()
self.schema = OrderedDict()
_global_instance['plugin'] = self
Expand Down Expand Up @@ -230,10 +229,6 @@ def _setup_static_metrics(self):

return metrics

def shutdown(self):
    """Clear the ``serving`` flag on this instance."""
    self.serving = False

def get_health(self):
health = json.loads(self.get('health')['json'])
self.metrics['health_status'].set(
Expand Down Expand Up @@ -389,10 +384,13 @@ def index(self):

@cherrypy.expose
def metrics(self):
    """Serve the collected metrics as ``text/plain``.

    Returns the output of ``format_metrics`` when ``collect()`` yields
    something; otherwise nothing is returned.

    Raises:
        cherrypy.HTTPError: 503 when the module instance reports that it
            has no MON connection.
    """
    # Check the MON connection *before* collecting: running collect()
    # unconditionally first would defeat the guard and do the work twice.
    if not global_instance().have_mon_connection():
        raise cherrypy.HTTPError(503, 'No MON connection')

    metrics = global_instance().collect()
    cherrypy.response.headers['Content-Type'] = 'text/plain'
    if metrics:
        return self.format_metrics(metrics)

server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
server_port = self.get_localized_config('server_port', DEFAULT_PORT)
Expand All @@ -401,11 +399,72 @@ def metrics(self):
(server_addr, server_port)
)

# Publish the URI that others may use to access the service we're
# about to start serving
self.set_uri('http://{0}:{1}/'.format(
socket.getfqdn() if server_addr == '::' else server_addr,
server_port
))

cherrypy.config.update({
'server.socket_host': server_addr,
'server.socket_port': int(server_port),
'engine.autoreload.on': False
})
cherrypy.tree.mount(Root(), "/")
self.log.info('Starting engine...')
cherrypy.engine.start()
self.log.info('Engine started.')
cherrypy.engine.block()

def shutdown(self):
    """Wait for the CherryPy engine to reach STARTED, then make it exit."""
    self.log.info('Stopping engine...')
    engine = cherrypy.engine
    # Block until the engine is in the STARTED state before calling exit().
    started = engine.states.STARTED
    engine.wait(state=started)
    engine.exit()
    self.log.info('Stopped engine')


class StandbyModule(MgrStandbyModule):
    """HTTP endpoint served while this mgr is a standby.

    ``/`` renders a small HTML page linking to the ``metrics`` path of the
    URI returned by ``get_active_uri()``; ``/metrics`` answers with an empty
    ``text/plain`` body.
    """

    def serve(self):
        """Configure and start CherryPy, then block until the engine stops."""
        # Use the same address default as the active module's serve()
        # rather than a separately hard-coded literal.
        server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
        server_port = self.get_localized_config('server_port', DEFAULT_PORT)
        self.log.info("server_addr: %s server_port: %s" % (server_addr, server_port))
        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })

        # The nested handler class has its own ``self``; keep a reference
        # to this module under a different name so the handlers can use it.
        module = self

        class Root(object):

            @cherrypy.expose
            def index(self):
                """Return an HTML page linking to the active URI's metrics."""
                active_uri = module.get_active_uri()
                return '''<!DOCTYPE html>
<html>
<head><title>Ceph Exporter</title></head>
<body>
<h1>Ceph Exporter</h1>
<p><a href='{}metrics'>Metrics</a></p>
</body>
</html>'''.format(active_uri)

            @cherrypy.expose
            def metrics(self):
                """Return an empty plain-text body."""
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                return ''

        cherrypy.tree.mount(Root(), '/', {})
        self.log.info('Starting engine...')
        cherrypy.engine.start()
        self.log.info("Waiting for engine...")
        # Blocks here until the engine reaches the STOPPED state, which
        # shutdown() triggers via engine.stop().
        cherrypy.engine.wait(state=cherrypy.engine.states.STOPPED)
        self.log.info('Engine stopped.')

    def shutdown(self):
        """Wait for the engine to reach STARTED, then stop it."""
        self.log.info("Stopping engine...")
        cherrypy.engine.wait(state=cherrypy.engine.states.STARTED)
        cherrypy.engine.stop()
        self.log.info("Stopped engine")