Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions providers/cncf/kubernetes/docs/operators.rst
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,29 @@ It means that user can use all parameters from :class:`~airflow.providers.cncf.k

More information about the Jobs here: `Kubernetes Job Documentation <https://kubernetes.io/docs/concepts/workloads/controllers/job/>`__

Pod cleanup and ``on_finish_action``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When ``wait_until_job_complete=True``, the operator discovers Job pods via
``get_pods()`` and streams logs/XCom from those pods while the Job runs.

The inherited ``on_finish_action`` parameter controls what happens to these
discovered pods at the end of the task:

* ``delete_pod`` (default) — the pod is deleted after the task
finishes (success or failure).
* ``delete_succeeded_pod`` — the pod is deleted only when the task
succeeded.
* ``delete_active_pod`` — the pod is deleted only if it is still
active (``Pending`` or ``Running``).
* ``keep_pod`` — the pod is kept (useful for offline log
inspection).

When the task is killed, ``on_kill`` deletes the Job (with foreground cascade).
For discovered pods, deletion is controlled by ``on_kill_action``:
``delete_pod`` attempts direct pod deletion and ``keep_pod`` skips it.



.. _howto/operator:KubernetesDeleteJobOperator:

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Fix pod cleanup gaps in ``KubernetesJobOperator``.
``execute()`` and ``execute_complete()`` now consistently clean up pods discovered via ``get_pods()``,
including deferrable resume paths where pod lookup can fail.
The inherited ``on_finish_action`` parameter (``delete_pod`` / ``delete_succeeded_pod`` /
``delete_active_pod`` / ``keep_pod``) is honored for these pods, matching
``KubernetesPodOperator`` behavior.
In ``on_kill()``, pod cleanup now respects ``on_kill_action`` (``delete_pod`` deletes discovered pods;
``keep_pod`` skips pod deletion).
Per-pod cleanup errors are logged but never mask a Job-level failure.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import json
import logging
import os
import sys
import warnings
from collections.abc import Sequence
from functools import cached_property
Expand All @@ -41,9 +42,9 @@
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator, merge_objects
from airflow.providers.cncf.kubernetes.triggers.job import KubernetesJobTrigger
from airflow.providers.cncf.kubernetes.utils.pod_manager import EMPTY_XCOM_RESULT, PodNotFoundException
from airflow.providers.cncf.kubernetes.utils.pod_manager import EMPTY_XCOM_RESULT, OnKillAction
from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_1_PLUS
from airflow.providers.common.compat.sdk import AirflowException, conf
from airflow.providers.common.compat.sdk import AirflowException, TaskDeferred, conf
from airflow.utils import yaml

if AIRFLOW_V_3_1_PLUS:
Expand Down Expand Up @@ -218,44 +219,49 @@ def execute(self, context: Context):
ti.xcom_push(key="job_name", value=self.job.metadata.name)
ti.xcom_push(key="job_namespace", value=self.job.metadata.namespace)

if self.wait_until_job_complete:
self.pods: Sequence[k8s.V1Pod] = self.get_pods(
pod_request_obj=self.pod_request_obj, context=context
)
try:
if self.wait_until_job_complete:
self.pods: Sequence[k8s.V1Pod] = self.get_pods(
pod_request_obj=self.pod_request_obj, context=context
)

if self.deferrable:
self.execute_deferrable()
return
if self.deferrable:
self.execute_deferrable()
# execute_deferrable raises TaskDeferred; cleanup is handled
# by execute_complete on resume.
return

if self.do_xcom_push:
xcom_result = []
for pod in self.pods:
self.pod_manager.await_container_completion(
pod=pod, container_name=self.base_container_name
)
self.pod_manager.await_xcom_sidecar_container_start(pod=pod)
xcom_result.append(self.extract_xcom(pod=pod))
self.job = self.hook.wait_until_job_complete(
job_name=self.job.metadata.name,
namespace=self.job.metadata.namespace,
job_poll_interval=self.job_poll_interval,
)
if self.get_logs:
for pod in self.pods:
self.pod_manager.fetch_requested_container_logs(
pod=pod,
containers=self.container_logs,
follow_logs=True,
)

if self.do_xcom_push:
xcom_result = []
for pod in self.pods:
self.pod_manager.await_container_completion(
pod=pod, container_name=self.base_container_name
)
self.pod_manager.await_xcom_sidecar_container_start(pod=pod)
xcom_result.append(self.extract_xcom(pod=pod))
self.job = self.hook.wait_until_job_complete(
job_name=self.job.metadata.name,
namespace=self.job.metadata.namespace,
job_poll_interval=self.job_poll_interval,
)
if self.get_logs:
for pod in self.pods:
self.pod_manager.fetch_requested_container_logs(
pod=pod,
containers=self.container_logs,
follow_logs=True,
ti.xcom_push(key="job", value=self.job.to_dict())
if self.wait_until_job_complete:
if error_message := self.hook.is_job_failed(job=self.job):
raise AirflowException(
f"Kubernetes job '{self.job.metadata.name}' is failed with error '{error_message}'"
)

ti.xcom_push(key="job", value=self.job.to_dict())
if self.wait_until_job_complete:
if error_message := self.hook.is_job_failed(job=self.job):
raise AirflowException(
f"Kubernetes job '{self.job.metadata.name}' is failed with error '{error_message}'"
)
if self.do_xcom_push:
return xcom_result[0] if self.unwrap_single and len(xcom_result) == 1 else xcom_result
if self.do_xcom_push:
return xcom_result[0] if self.unwrap_single and len(xcom_result) == 1 else xcom_result
finally:
self._cleanup_monitoring_pods(context)

def execute_deferrable(self):
self.defer(
Expand All @@ -277,39 +283,85 @@ def execute_deferrable(self):
)

def execute_complete(self, context: Context, event: dict, **kwargs):
ti = context["ti"]
ti.xcom_push(key="job", value=event["job"])
if event["status"] == "error":
raise AirflowException(event["message"])

if self.get_logs:
for pod_name in event["pod_names"]:
pod_namespace = event["pod_namespace"]
try:
pod = self.hook.get_pod(pod_name, pod_namespace)
except ApiException as e:
if e.status == 404:
# Resolve monitoring pods up front so the log-retrieval path and the
# cleanup path in the finally block share the same lookup (no double
# ``hook.get_pod`` calls).
pods_by_name: dict[str, k8s.V1Pod] = {}
event_job = event.get("job")
job_namespace = (
event_job.get("metadata", {}).get("namespace")
if isinstance(event_job, dict)
else None
)
pod_namespace = event.get("pod_namespace") or event.get("namespace") or job_namespace
unresolved_pods: list[tuple[str, str]] = []
for pod_name in event.get("pod_names") or []:
if not pod_namespace:
self.log.warning(
"Skipping pod %s lookup because no pod namespace was provided in trigger event.",
pod_name,
)
continue
try:
pod = self.hook.get_pod(pod_name, pod_namespace)
except ApiException as e:
if e.status == 404:
self.log.warning(
"Pod %s in namespace %s not found (possibly deleted).",
pod_name,
pod_namespace,
)
else:
self.log.warning(
"Failed to retrieve pod %s in namespace %s: %s. Skipping.",
pod_name,
pod_namespace,
e,
)
unresolved_pods.append((pod_name, pod_namespace))
continue
except Exception as e:
self.log.warning(
"Failed to retrieve pod %s in namespace %s: %s. Skipping.",
pod_name,
pod_namespace,
e,
)
unresolved_pods.append((pod_name, pod_namespace))
continue
if pod is not None:
pods_by_name[pod_name] = pod

try:
ti = context["ti"]
ti.xcom_push(key="job", value=event["job"])
if event["status"] == "error":
raise AirflowException(event["message"])

if self.get_logs:
for pod_name in event.get("pod_names") or []:
if pod_name not in pods_by_name:
# Pod was reported by the trigger but missing now (e.g. 404)
self.log.warning(
"Pod %s in namespace %s not found (possibly deleted). Skipping log retrieval.",
pod_name,
pod_namespace,
"Skipping log retrieval for pod %s (not found).", pod_name
)
continue
raise
if not pod:
raise PodNotFoundException("Could not find pod after resuming from deferral")
self._write_logs(pod)

if self.do_xcom_push:
xcom_results: list[Any | None] = []
for xcom_result in event["xcom_result"]:
if isinstance(xcom_result, str) and xcom_result.rstrip() == EMPTY_XCOM_RESULT:
self.log.info("xcom result file is empty.")
xcom_results.append(None)
continue
self.log.info("xcom result: \n%s", xcom_result)
xcom_results.append(json.loads(xcom_result))
return xcom_results[0] if self.unwrap_single and len(xcom_results) == 1 else xcom_results
self._write_logs(pods_by_name[pod_name])

if self.do_xcom_push:
xcom_results: list[Any | None] = []
for xcom_result in event["xcom_result"]:
if isinstance(xcom_result, str) and xcom_result.rstrip() == EMPTY_XCOM_RESULT:
self.log.info("xcom result file is empty.")
xcom_results.append(None)
continue
self.log.info("xcom result: \n%s", xcom_result)
xcom_results.append(json.loads(xcom_result))
return xcom_results[0] if self.unwrap_single and len(xcom_results) == 1 else xcom_results
finally:
self._cleanup_monitoring_pods_from_dict(
context, pods_by_name, unresolved_pods=unresolved_pods, event_status=event.get("status")
)

@staticmethod
def deserialize_job_template_file(path: str) -> k8s.V1Job:
Expand All @@ -334,6 +386,7 @@ def deserialize_job_template_file(path: str) -> k8s.V1Job:
return api_client._ApiClient__deserialize_model(job, k8s.V1Job)

def on_kill(self) -> None:
self._killed = True
if self.job:
job = self.job
kwargs = {
Expand All @@ -344,6 +397,95 @@ def on_kill(self) -> None:
if self.termination_grace_period is not None:
kwargs.update(grace_period_seconds=self.termination_grace_period)
self.job_client.delete_namespaced_job(**kwargs)
if self.on_kill_action == OnKillAction.KEEP_POD:
self.log.info(
"Skipping monitoring pod deletion since on_kill_action is set to %r.",
self.on_kill_action.value,
)
return
# Monitoring pods discovered via get_pods() have no ownerReferences and
# are not reaped by the Job's foreground cascade. Delete them directly.
for pod in getattr(self, "pods", None) or []:
try:
self.pod_manager.delete_pod(pod)
except ApiException:
self.log.exception(
"Unable to delete monitoring pod %s",
getattr(pod.metadata, "name", "<unknown>"),
)

def _cleanup_monitoring_pods(self, context: Context) -> None:
"""Run ``post_complete_action`` on each monitoring pod from ``self.pods``.

Honours ``on_finish_action`` (inherited from ``KubernetesPodOperator``)
and runs as a side-effect: any per-pod cleanup error is logged but never
masks the in-flight exception (e.g. an ``AirflowException`` raised because
the Job itself failed).
"""
# Skip cleanup when control is leaving execute() via TaskDeferred: the
# deferred trigger still needs the monitoring pods to exist; the pods
# will be cleaned up by execute_complete() on resume.
exc = sys.exc_info()[1]
if isinstance(exc, TaskDeferred):
return
for pod in getattr(self, "pods", None) or []:
remote_pod = pod
try:
pod_name = getattr(pod.metadata, "name", None)
pod_namespace = getattr(pod.metadata, "namespace", None)
if pod_name and pod_namespace:
remote_pod = self.hook.get_pod(name=pod_name, namespace=pod_namespace) or pod
except Exception:
remote_pod = pod
try:
self.post_complete_action(
pod=pod,
remote_pod=remote_pod,
context=context,
result=None,
)
except Exception:
# cleanup() can raise AirflowException for failed pods, and the
# k8s client can raise transport errors. For the Job operator we
# prefer the Job-level failure (or the original exception) to
# propagate instead of any per-pod cleanup error.
self.log.warning(
"Error while cleaning up monitoring pod %s",
getattr(pod.metadata, "name", "<unknown>"),
exc_info=True,
)

def _cleanup_monitoring_pods_from_dict(
self,
context: Context,
pods_by_name: dict[str, k8s.V1Pod],
*,
unresolved_pods: list[tuple[str, str]] | None = None,
event_status: str | None = None,
) -> None:
"""Run ``post_complete_action`` on each pod previously resolved via the trigger event.

Same semantics as :meth:`_cleanup_monitoring_pods` - errors are logged
but never mask the in-flight exception.
"""
for pod_name, pod in pods_by_name.items():
try:
self.post_complete_action(
pod=pod, remote_pod=pod, context=context, result=None
)
except Exception:
self.log.warning(
"Error while cleaning up monitoring pod %s",
pod_name,
exc_info=True,
)
pod_phase = "Succeeded" if event_status == "success" else "Failed" if event_status == "error" else None
for pod_name, pod_namespace in unresolved_pods or []:
fallback_pod = k8s.V1Pod(
metadata=k8s.V1ObjectMeta(name=pod_name, namespace=pod_namespace),
status=k8s.V1PodStatus(phase=pod_phase),
)
self.process_pod_deletion(fallback_pod, reraise=False)

def build_job_request_obj(self, context: Context | None = None) -> k8s.V1Job:
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ async def run(self) -> AsyncIterator[TriggerEvent]:
{
"name": job.metadata.name,
"namespace": job.metadata.namespace,
"pod_names": [pod_name for pod_name in self.pod_names] if self.get_logs else None,
"pod_namespace": self.pod_namespace if self.get_logs else None,
"pod_names": list(self.pod_names),
"pod_namespace": self.pod_namespace,
"status": "error" if error_message else "success",
"message": f"Job failed with error: {error_message}"
if error_message
Expand Down
Loading
Loading