Skip to content

Commit

Permalink
Centralized logging via otel
Browse files Browse the repository at this point in the history
  • Loading branch information
chrismeyersfsu committed May 16, 2024
1 parent 4d641b6 commit 63fe52a
Show file tree
Hide file tree
Showing 11 changed files with 270 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ VAULT ?= false
VAULT_TLS ?= false
# If set to true docker-compose will also start a tacacs+ instance
TACACS ?= false
# If set to true docker-compose will also start an OpenTelemetry Collector instance
OTEL ?= false
# If set to true docker-compose will also start a Loki instance
LOKI ?= false
# If set to true docker-compose will install editable dependencies
EDITABLE_DEPENDENCIES ?= false

Expand Down Expand Up @@ -535,6 +539,8 @@ docker-compose-sources: .git/hooks/pre-commit
-e enable_vault=$(VAULT) \
-e vault_tls=$(VAULT_TLS) \
-e enable_tacacs=$(TACACS) \
-e enable_otel=$(OTEL) \
-e enable_loki=$(LOKI) \
-e install_editable_dependencies=$(EDITABLE_DEPENDENCIES) \
$(EXTRA_SOURCES_ANSIBLE_OPTS)

Expand Down
2 changes: 2 additions & 0 deletions awx/main/utils/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
)
import logging
from logging import Filter
from dateutil.tz import tzutc
from datetime import datetime

from django.apps import apps
from django.db import models
Expand Down
47 changes: 47 additions & 0 deletions awx/main/utils/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
# All Rights Reserved.

# Python
import base64
import logging
import sys
import traceback
import os
from datetime import datetime

# Django
Expand All @@ -15,6 +17,15 @@
# AWX
from awx.main.exceptions import PostRunError

# OTEL
from opentelemetry._logs import set_logger_provider
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter as OTLPGrpcLogExporter
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter as OTLPHttpLogExporter

from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource


class RSysLogHandler(logging.handlers.SysLogHandler):
append_nul = False
Expand Down Expand Up @@ -133,3 +144,39 @@ def format(self, record):
pass
else:
ColorHandler = logging.StreamHandler


class OTLPHandler(LoggingHandler):
    """Logging handler that exports log records to an OTLP endpoint.

    On construction the handler builds a ``LoggerProvider`` tagged with the
    service name and instance id, registers it through
    ``set_logger_provider``, and attaches a ``BatchLogRecordProcessor`` that
    wraps either the gRPC or the HTTP OTLP log exporter.

    Parameters:
        endpoint: Address of the OTLP receiver. Required.
        protocol: ``'grpc'`` (default) or ``'http'``; selects the exporter.
        service_name: Value for the ``service.name`` resource attribute.
            Defaults to ``sys.argv[1]`` when present, otherwise
            ``sys.argv[0]``, otherwise ``'unknown_service'``.
        instance_id: Value for the ``service.instance.id`` resource
            attribute. Defaults to the node name reported by ``os.uname()``.
        auth: ``'basic'`` to send an HTTP Basic ``authorization`` header;
            any other value sends no authentication header.
        username: User name for basic auth.
        password: Password for basic auth.

    Raises:
        ValueError: if ``endpoint`` is empty, ``protocol`` is not one of the
            supported values, or ``auth == 'basic'`` without both
            ``username`` and ``password``.
    """

    def __init__(self, endpoint=None, protocol='grpc', service_name=None, instance_id=None, auth=None, username=None, password=None):
        if not endpoint:
            raise ValueError("endpoint required")

        # Reject unknown protocols before touching any global state; without
        # this check no exporter would be bound further down and the
        # constructor would fail with an UnboundLocalError after the global
        # logger provider had already been replaced.
        if protocol not in ('grpc', 'http'):
            raise ValueError(f"unsupported protocol {protocol!r}, expected 'grpc' or 'http'")

        if auth == 'basic' and (username is None or password is None):
            raise ValueError("auth type basic requires username and password parameters")

        self.endpoint = endpoint
        self.service_name = service_name or (sys.argv[1] if len(sys.argv) > 1 else (sys.argv[0] or 'unknown_service'))
        self.instance_id = instance_id or os.uname().nodename

        logger_provider = LoggerProvider(
            resource=Resource.create(
                {
                    "service.name": self.service_name,
                    "service.instance.id": self.instance_id,
                }
            ),
        )
        # NOTE: this registers the provider process-wide, not just for this handler.
        set_logger_provider(logger_provider)

        headers = {}
        if auth == 'basic':
            secret = f'{username}:{password}'
            # Lowercase key: gRPC metadata keys must be lowercase, and HTTP
            # header names are case-insensitive, so this form works for both
            # exporters.
            headers['authorization'] = "Basic " + base64.b64encode(secret.encode()).decode()

        if protocol == 'grpc':
            # NOTE(review): insecure=True is hard-coded, so the gRPC exporter
            # is asked for a non-TLS channel -- confirm this is acceptable
            # outside the development environment.
            otlp_exporter = OTLPGrpcLogExporter(endpoint=self.endpoint, insecure=True, headers=headers)
        else:
            otlp_exporter = OTLPHttpLogExporter(endpoint=self.endpoint, headers=headers)
        logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter))

        super().__init__(level=logging.NOTSET, logger_provider=logger_provider)
1 change: 1 addition & 0 deletions awx/settings/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,7 @@
'address': '/var/run/awx-rsyslog/rsyslog.sock',
'filters': ['external_log_enabled', 'dynamic_level_filter', 'guid'],
},
'otel': {'class': 'logging.NullHandler'},
},
'loggers': {
'django': {'handlers': ['console']},
Expand Down
6 changes: 6 additions & 0 deletions requirements/requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ pip>=21.3 # PEP 660 – Editable installs for pyproject.toml based builds (wheel
debugpy
remote-pdb
sdb

# OTEL
opentelemetry-api==1.24.0
opentelemetry-sdk==1.24.0
opentelemetry-instrumentation-logging
opentelemetry-exporter-otlp
10 changes: 10 additions & 0 deletions tools/docker-compose/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -613,3 +613,13 @@ docker exec -it -e VAULT_TOKEN=<token> tools_vault_1 vault kv get --address=http
### Prometheus and Grafana integration

See docs at https://github.com/ansible/awx/blob/devel/tools/grafana/README.md

### OpenTelemetry Integration

```bash
OTEL=true GRAFANA=true LOKI=true PROMETHEUS=true make docker-compose
```

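This starts the sidecar container `tools_otel_1` and configures AWX logging to send to it. The OpenTelemetry Collector is configured to export logs to Loki, and Grafana is configured with Loki as a datasource, so AWX logs can be viewed in Grafana. In the docker-compose template the `otel` service depends on `loki`, and `loki` depends on `grafana`, so enable `LOKI` and `GRAFANA` together with `OTEL`.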

Grafana is available at `http://localhost:3001`.
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,42 @@ services:
# pg_notify will NOT work in transaction mode.
PGBOUNCER_POOL_MODE: session
{% endif %}
{% if enable_otel|bool %}
otel:
image: otel/opentelemetry-collector-contrib:0.88.0
container_name: tools_otel_1
hostname: otel
command: ["--config=/etc/otel-collector-config.yaml", ""]
networks:
- awx
ports:
- "4317:4317" # OTLP gRPC receiver
- "4318:4318" # OTLP http receiver
- "55679:55679" # zpages http://localhost:55679/debug/servicez /tracez
volumes:
- "../../otel/otel-collector-config.yaml:/etc/otel-collector-config.yaml"
depends_on:
- loki
{% endif %}
{% if enable_loki|bool %}
loki:
image: grafana/loki:2.9.5
container_name: tools_loki_1
hostname: loki
ports:
- "3100:3100"
command: -config.file=/etc/loki/local-config.yaml
networks:
- awx
volumes:
- "loki_storage:/loki:rw"
#- "../../docker-compose/loki/volumes/index:/loki/index"
#- "../../docker-compose/loki/volumes/boltdb-cache:/loki/boltdb-cache"
- "../../loki/local-config.yaml:/etc/loki/local-config.yaml"
depends_on:
- grafana
{% endif %}

{% if execution_node_count|int > 0 %}
receptor-hop:
image: {{ receptor_image }}
Expand Down Expand Up @@ -360,6 +396,10 @@ volumes:
grafana_storage:
name: tools_grafana_storage
{% endif %}
{% if enable_loki|bool %}
loki_storage:
name: tools_loki_storage
{% endif %}

networks:
awx:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,18 @@ OPTIONAL_API_URLPATTERN_PREFIX = '{{ api_urlpattern_prefix }}'
# LOGGING['loggers']['django_auth_ldap']['handlers'] = ['console']
# LOGGING['loggers']['django_auth_ldap']['level'] = 'DEBUG'

{% if enable_otel|bool %}
# Turn the 'otel' handler entry into an OTLPHandler pointed at the
# collector sidecar.
LOGGING['handlers']['otel'].update(
    {
        'class': 'awx.main.utils.handlers.OTLPHandler',
        'endpoint': 'http://otel:4317',
    }
)
# Attach the otel handler to every configured logger that declares a
# 'handlers' list. Loggers without a 'handlers' key are left untouched: the
# fallback list returned by .get() is never stored back.
for logger_settings in LOGGING['loggers'].values():
    attached = logger_settings.get('handlers', [])
    if 'otel' in attached:
        continue
    attached.append('otel')
{% endif %}

BROADCAST_WEBSOCKET_PORT = 8013
BROADCAST_WEBSOCKET_VERIFY_CERT = False
BROADCAST_WEBSOCKET_PROTOCOL = 'http'
Expand Down
11 changes: 11 additions & 0 deletions tools/grafana/datasources/loki_source.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
apiVersion: 1

datasources:
- name: Loki
type: loki
access: proxy
url: http://loki:3100
jsonData:
timeout: 60
maxLines: 100000
96 changes: 96 additions & 0 deletions tools/loki/local-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
auth_enabled: false

server:
http_listen_port: 3100
grpc_server_max_recv_msg_size: 524288000 # 500 MB
grpc_server_max_send_msg_size: 524288000 # 500 MB, might be too much, be careful

frontend_worker:
match_max_concurrent: true
grpc_client_config:
max_send_msg_size: 524288000 # 500 MB


ingester:
max_chunk_age: 8766h

common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory

# compactor:
# retention_enabled: true
# # cmeyers: YOLO. 1s seems wrong but it works so right
# compaction_interval: 1s # default 10m

schema_config:
configs:
- from: 2020-10-24
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h

storage_config:
boltdb_shipper:
active_index_directory: /loki/index
cache_location: /loki/boltdb-cache

ruler:
alertmanager_url: http://localhost:9093

limits_config:
retention_period: 3y
# cmeyers: The default of 30m triggers a loop of queries that take a long time
# to complete and the UI times out
split_queries_by_interval: 1d
# cmeyers: Default of 30d1h limits grafana time queries. Can't, for example,
# query last 90 days
max_query_length: 3y
# cmeyers: Made the batch post request succeed.
reject_old_samples: false
reject_old_samples_max_age: 365d

ingestion_rate_mb: 32
ingestion_burst_size_mb: 32
per_stream_rate_limit: 32M
per_stream_rate_limit_burst: 32M
ingestion_rate_strategy: local # Default: global
max_global_streams_per_user: 100000000
max_entries_limit_per_query: 100000000
max_query_series: 1000000
max_query_parallelism: 32 # Old Default: 14
max_streams_per_user: 100000000 # Old Default: 10000

# Taken from aap-log-visualizer
frontend:
max_outstanding_per_tenant: 2048

query_scheduler:
max_outstanding_requests_per_tenant: 2048

query_range:
parallelise_shardable_queries: false
split_queries_by_interval: 0

# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
#
# Statistics help us better understand how Loki is used, and they show us performance
# levels for most users. This helps us prioritize features and documentation.
# For more information on what's sent, look at
# https://github.com/grafana/loki/blob/main/pkg/usagestats/stats.go
# Refer to the buildReport method to see what goes into a report.
#
# If you would like to disable reporting, uncomment the following lines:
#analytics:
# reporting_enabled: false
39 changes: 39 additions & 0 deletions tools/otel/otel-collector-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
receivers:
otlp:
protocols:
grpc:

exporters:
debug:
verbosity: detailed

loki:
endpoint: http://loki:3100/loki/api/v1/push
tls:
insecure: true
headers:
"X-Scope-OrgID": "1"
default_labels_enabled:
exporter: true
job: true
instance: true
level: true

processors:
batch:

extensions:
health_check:
zpages:
endpoint: ":55679"

service:
pipelines:
logs:
receivers: [otlp]
processors: [batch]
exporters: [loki]

extensions:
- health_check
- zpages

0 comments on commit 63fe52a

Please sign in to comment.