Skip to content

Commit

Permalink
feat: add log-cleanup sidecar to scheduler/worker (#554)
Browse files Browse the repository at this point in the history
Signed-off-by: Mathew Wicks <thesuperzapper@users.noreply.github.com>
  • Loading branch information
thesuperzapper authored Mar 31, 2022
1 parent 1bc1852 commit 4fbb856
Show file tree
Hide file tree
Showing 7 changed files with 219 additions and 9 deletions.
28 changes: 28 additions & 0 deletions charts/airflow/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,12 @@ workers:
## how many seconds (after the 9min) to wait before SIGKILL
terminationPeriod: 60
logCleanup:
resources:
requests:
## IMPORTANT! for autoscaling to work with logCleanup
memory: "64Mi"
dags:
gitSync:
resources:
Expand Down Expand Up @@ -931,6 +937,16 @@ airflow:
## this does NOT give root permissions to Pods, only the "root" group
fsGroup: 0
scheduler:
logCleanup:
## scheduler log-cleanup must be disabled if `logs.persistence.enabled` is `true`
enabled: false

workers:
logCleanup:
## workers log-cleanup must be disabled if `logs.persistence.enabled` is `true`
enabled: false

logs:
persistence:
enabled: true
Expand All @@ -954,6 +970,16 @@ airflow:
## this does NOT give root permissions to Pods, only the "root" group
fsGroup: 0
scheduler:
logCleanup:
## scheduler log-cleanup must be disabled if `logs.persistence.enabled` is `true`
enabled: false

workers:
logCleanup:
## workers log-cleanup must be disabled if `logs.persistence.enabled` is `true`
enabled: false

logs:
persistence:
enabled: true
Expand Down Expand Up @@ -1486,6 +1512,7 @@ Parameter | Description | Default
`scheduler.podAnnotations` | Pod annotations for the scheduler Deployment | `{}`
`scheduler.safeToEvict` | if we add the annotation: "cluster-autoscaler.kubernetes.io/safe-to-evict" = "true" | `true`
`scheduler.podDisruptionBudget.*` | configs for the PodDisruptionBudget of the scheduler | `<see values.yaml>`
`scheduler.logCleanup.*` | configs for the log-cleanup sidecar of the scheduler | `<see values.yaml>`
`scheduler.numRuns` | the value of the `airflow --num_runs` parameter used to run the airflow scheduler | `-1`
`scheduler.extraPipPackages` | extra pip packages to install in the scheduler Pods | `[]`
`scheduler.extraVolumeMounts` | extra VolumeMounts for the scheduler Pods | `[]`
Expand Down Expand Up @@ -1549,6 +1576,7 @@ Parameter | Description | Default
`workers.autoscaling.*` | configs for the HorizontalPodAutoscaler of the worker Pods | `<see values.yaml>`
`workers.celery.*` | configs for the celery worker Pods | `<see values.yaml>`
`workers.terminationPeriod` | how many seconds to wait after SIGTERM before SIGKILL of the celery worker | `60`
`workers.logCleanup.*` | configs for the log-cleanup sidecar of the worker Pods | `<see values.yaml>`
`workers.extraPipPackages` | extra pip packages to install in the worker Pods | `[]`
`workers.extraVolumeMounts` | extra VolumeMounts for the worker Pods | `[]`
`workers.extraVolumes` | extra Volumes for the worker Pods | `[]`
Expand Down
64 changes: 55 additions & 9 deletions charts/airflow/examples/google-gke/custom-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,27 @@ scheduler:
cpu: "1000m"
memory: "512Mi"

## configs for the log-cleanup sidecar of the scheduler
##
## NOTE:
## - helps prevent excessive log buildup by regularly deleting old files
##
logCleanup:
## if the log-cleanup sidecar is enabled
##
enabled: true

## resource requests/limits for the log-cleanup container
##
resources:
requests:
cpu: "10m"
memory: "32Mi"

## the number of minutes to retain log files (by last-modified time)
##
retentionMinutes: 21600

###################################
# Airflow - WebUI Configs
###################################
Expand Down Expand Up @@ -245,6 +266,30 @@ workers:
##
terminationPeriod: 60

## configs for the log-cleanup sidecar of the worker Pods
##
## NOTE:
## - helps prevent excessive log buildup by regularly deleting old files
##
logCleanup:
## if the log-cleanup sidecar is enabled
##
enabled: true

## resource requests/limits for the log-cleanup container
##
## WARNING:
## - you MUST SPECIFY a resource request for logCleanup if using `workers.autoscaling`
##
resources:
requests:
cpu: "10m"
memory: "32Mi"

## the number of minutes to retain log files (by last-modified time)
##
retentionMinutes: 21600

###################################
# Airflow - Flower Configs
###################################
Expand Down Expand Up @@ -291,6 +336,16 @@ dags:
##
enabled: true

## resource requests/limits for the git-sync container
##
## WARNING:
## - you MUST SPECIFY a resource request for gitSync if using `workers.autoscaling`
##
resources:
requests:
cpu: "50m"
memory: "64Mi"

## the url of the git repo
##
repo: "git@repo.example.com/my-airflow-dags.git"
Expand All @@ -315,15 +370,6 @@ dags:
##
sshSecretKey: id_rsa

## resource requests/limits for the git-sync container
##
## WARNING:
## - you MUST SPECIFY a resource request for gitSync if using `workers.autoscaling`
##
requests:
cpu: "50m"
memory: "64Mi"

###################################
# Kubernetes - RBAC
###################################
Expand Down
69 changes: 69 additions & 0 deletions charts/airflow/templates/_helpers/pods.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,69 @@ EXAMPLE USAGE: {{ include "airflow.container.git_sync" (dict "Release" .Release
{{- end }}
{{- end }}

{{/*
Define a container which regularly deletes airflow logs older than a retention period.

The template context must be a dict with:
- "Release" / "Values": the chart's `.Release` and `.Values` (passed on to the shared image/env/command helpers)
- "resources": the ResourceRequirements rendered under `resources:` for this container
- "retention_min": exported as RETENTION_MINUTES; `*.log` files last modified more than this many minutes ago are deleted
- "interval_sec": exported as INTERVAL_SECONDS; the target number of seconds between the start of each cleanup pass

The container mounts the `logs-data` volume at `.Values.logs.path`, which is also exported as LOG_PATH.

EXAMPLE USAGE: {{ include "airflow.container.log_cleanup" (dict "Release" .Release "Values" .Values "resources" $lc_resources "retention_min" $lc_retention_min "interval_sec" $lc_interval_sec) }}
*/}}
{{- define "airflow.container.log_cleanup" }}
- name: log-cleanup
  {{- include "airflow.image" . | indent 2 }}
  resources:
    {{- toYaml .resources | nindent 4 }}
  envFrom:
    {{- include "airflow.envFrom" . | indent 4 }}
  env:
    - name: LOG_PATH
      value: {{ .Values.logs.path | quote }}
    - name: RETENTION_MINUTES
      value: {{ .retention_min | quote }}
    - name: INTERVAL_SECONDS
      value: {{ .interval_sec | quote }}
    {{- /* this has user-defined variables, so must be included BELOW (so the ABOVE `env` take precedence) */ -}}
    {{- include "airflow.env" . | indent 4 }}
  command:
    {{- include "airflow.command" . | indent 4 }}
  args:
    - "bash"
    - "-c"
    - |
      set -euo pipefail

      # break the infinite loop when we receive SIGINT or SIGTERM
      trap "exit 0" SIGINT SIGTERM

      while true; do
        START_EPOCH=$(date --utc +%s)
        echo "[$(date --utc +%FT%T.%3N)] deleting log files older than $RETENTION_MINUTES minutes..."

        # delete all writable files ending in ".log" with modified-time older than $RETENTION_MINUTES
        # NOTE: `-printf "."` prints a "." for each deleted file, which we count the bytes of with `wc -c`
        DELETED_COUNT=$(
          find "$LOG_PATH" \
            -type f \
            -name "*.log" \
            -mmin +"$RETENTION_MINUTES" \
            -writable \
            -delete \
            -printf "." \
          | wc -c
        )

        END_EPOCH=$(date --utc +%s)
        LOOP_DURATION=$((END_EPOCH - START_EPOCH))
        echo "[$(date --utc +%FT%T.%3N)] deleted $DELETED_COUNT files in $LOOP_DURATION seconds"

        # only sleep for what remains of the interval after the time spent deleting
        SECONDS_TO_SLEEP=$((INTERVAL_SECONDS - LOOP_DURATION))
        if (( SECONDS_TO_SLEEP > 0 )); then
          echo "[$(date --utc +%FT%T.%3N)] waiting $SECONDS_TO_SLEEP seconds..."

          # NOTE: bash defers trap handlers until the current foreground command finishes,
          #       but a trapped signal interrupts the `wait` builtin immediately, so we
          #       background the `sleep` and `wait` on it to let the trap above run promptly
          sleep "$SECONDS_TO_SLEEP" &
          wait $!
        fi
      done
  volumeMounts:
    - name: logs-data
      mountPath: {{ .Values.logs.path }}
{{- end }}

{{/*
The list of `volumeMounts` for web/scheduler/worker/flower container
EXAMPLE USAGE: {{ include "airflow.volumeMounts" (dict "Release" .Release "Values" .Values "extraPipPackages" $extraPipPackages "extraVolumeMounts" $extraVolumeMounts) }}
Expand Down Expand Up @@ -304,6 +367,9 @@ EXAMPLE USAGE: {{ include "airflow.volumeMounts" (dict "Release" .Release "Value
- name: logs-data
mountPath: {{ .Values.logs.path }}
subPath: {{ .Values.logs.persistence.subPath }}
{{- else }}
- name: logs-data
mountPath: {{ .Values.logs.path }}
{{- end }}

{{- /* pip-packages */ -}}
Expand Down Expand Up @@ -363,6 +429,9 @@ EXAMPLE USAGE: {{ include "airflow.volumes" (dict "Release" .Release "Values" .V
{{- else }}
claimName: {{ printf "%s-logs" (include "airflow.fullname" . | trunc 58) }}
{{- end }}
{{- else }}
- name: logs-data
emptyDir: {}
{{- end }}

{{- /* git-sync */ -}}
Expand Down
8 changes: 8 additions & 0 deletions charts/airflow/templates/_helpers/validate-values.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@
{{- if not (eq .Values.logs.persistence.accessMode "ReadWriteMany") }}
{{ required "The `logs.persistence.accessMode` must be `ReadWriteMany`!" nil }}
{{- end }}
{{- if .Values.scheduler.logCleanup.enabled }}
{{ required "If `logs.persistence.enabled=true`, then `scheduler.logCleanup.enabled` must be disabled!" nil }}
{{- end }}
{{- if .Values.workers.enabled }}
{{- if .Values.workers.logCleanup.enabled }}
{{ required "If `logs.persistence.enabled=true`, then `workers.logCleanup.enabled` must be disabled!" nil }}
{{- end }}
{{- end }}
{{- end }}

{{/* Checks for `dags.persistence` */}}
Expand Down
6 changes: 6 additions & 0 deletions charts/airflow/templates/scheduler/scheduler-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ spec:
{{- if .Values.dags.gitSync.enabled }}
{{- include "airflow.container.git_sync" . | indent 8 }}
{{- end }}
{{- if .Values.scheduler.logCleanup.enabled }}
{{- $lc_resources := .Values.scheduler.logCleanup.resources }}
{{- $lc_retention_min := .Values.scheduler.logCleanup.retentionMinutes }}
{{- $lc_interval_sec := .Values.scheduler.logCleanup.intervalSeconds }}
{{- include "airflow.container.log_cleanup" (dict "Release" .Release "Values" .Values "resources" $lc_resources "retention_min" $lc_retention_min "interval_sec" $lc_interval_sec) | indent 8 }}
{{- end }}
{{- if .Values.airflow.extraContainers }}
{{- toYaml .Values.airflow.extraContainers | nindent 8 }}
{{- end }}
Expand Down
6 changes: 6 additions & 0 deletions charts/airflow/templates/worker/worker-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,12 @@ spec:
{{- if .Values.dags.gitSync.enabled }}
{{- include "airflow.container.git_sync" . | indent 8 }}
{{- end }}
{{- if .Values.workers.logCleanup.enabled }}
{{- $lc_resources := .Values.workers.logCleanup.resources }}
{{- $lc_retention_min := .Values.workers.logCleanup.retentionMinutes }}
{{- $lc_interval_sec := .Values.workers.logCleanup.intervalSeconds }}
{{- include "airflow.container.log_cleanup" (dict "Release" .Release "Values" .Values "resources" $lc_resources "retention_min" $lc_retention_min "interval_sec" $lc_interval_sec) | indent 8 }}
{{- end }}
{{- if .Values.airflow.extraContainers }}
{{- toYaml .Values.airflow.extraContainers | nindent 8 }}
{{- end }}
Expand Down
47 changes: 47 additions & 0 deletions charts/airflow/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,29 @@ scheduler:
##
minAvailable: ""

## configs for the log-cleanup sidecar of the scheduler
## - helps prevent excessive log buildup by regularly deleting old files
##
logCleanup:
## if the log-cleanup sidecar is enabled
## - [WARNING] must be disabled if `logs.persistence.enabled` is `true`
##
enabled: true

## resource requests/limits for the log-cleanup container
## - spec of ResourceRequirements:
## https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#resourcerequirements-v1-core
##
resources: {}

## the number of minutes to retain log files (by last-modified time)
## - the default of 21600 minutes is 15 days
##
retentionMinutes: 21600

## the number of seconds between each check for files to delete
##
intervalSeconds: 900

## sets `airflow --num_runs` parameter used to run the airflow scheduler
##
numRuns: -1
Expand Down Expand Up @@ -793,6 +816,7 @@ workers:

## configs for the HorizontalPodAutoscaler of the worker Pods
## - [WARNING] if using git-sync, ensure `dags.gitSync.resources` is set
## - [WARNING] if using worker log-cleanup, ensure `workers.logCleanup.resources` is set
##
## ____ EXAMPLE _______________
## autoscaling:
Expand Down Expand Up @@ -837,6 +861,29 @@ workers:
##
terminationPeriod: 60

## configs for the log-cleanup sidecar of the worker Pods
## - helps prevent excessive log buildup by regularly deleting old files
##
logCleanup:
## if the log-cleanup sidecar is enabled
## - [WARNING] must be disabled if `logs.persistence.enabled` is `true`
##
enabled: true

## resource requests/limits for the log-cleanup container
## - spec of ResourceRequirements:
## https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#resourcerequirements-v1-core
##
resources: {}

## the number of minutes to retain log files (by last-modified time)
## - the default of 21600 minutes is 15 days
##
retentionMinutes: 21600

## the number of seconds between each check for files to delete
##
intervalSeconds: 900

## extra pip packages to install in the worker Pod
##
## ____ EXAMPLE _______________
Expand Down

0 comments on commit 4fbb856

Please sign in to comment.