-
Notifications
You must be signed in to change notification settings - Fork 17.1k
Fix TI failure handling when task cannot be unmapped. #23119
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
810ccc0
Fix TI failure handling when task cannot be unmapped.
ashb 0c31aff
fixup! Fix TI failure handling when task cannot be unmapped.
ashb 1a523ae
Update airflow/models/taskinstance.py
ashb f59a16b
fixup! Fix TI failure handling when task cannot be unmapped.
ashb File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| import hashlib | ||
| import logging | ||
| import math | ||
| import operator | ||
| import os | ||
| import pickle | ||
| import signal | ||
|
|
@@ -133,6 +134,7 @@ | |
|
|
||
|
|
||
| if TYPE_CHECKING: | ||
| from airflow.models.baseoperator import BaseOperator | ||
| from airflow.models.dag import DAG, DagModel | ||
| from airflow.models.dagrun import DagRun | ||
| from airflow.models.operator import Operator | ||
|
|
@@ -1901,24 +1903,15 @@ def handle_failure( | |
| if not test_mode: | ||
| self.refresh_from_db(session) | ||
|
|
||
| task = self.task.unmap() | ||
| self.end_date = timezone.utcnow() | ||
| self.set_duration() | ||
| Stats.incr(f'operator_failures_{task.task_type}', 1, 1) | ||
| Stats.incr(f'operator_failures_{self.task.task_type}') | ||
|
Comment on lines
-1907
to
+1908
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does this change? |
||
| Stats.incr('ti_failures') | ||
| if not test_mode: | ||
| session.add(Log(State.FAILED, self)) | ||
|
|
||
| # Log failure duration | ||
| session.add( | ||
| TaskFail( | ||
| task=task, | ||
| run_id=self.run_id, | ||
| start_date=self.start_date, | ||
| end_date=self.end_date, | ||
| map_index=self.map_index, | ||
| ) | ||
| ) | ||
| session.add(TaskFail(ti=self)) | ||
|
|
||
| self.clear_next_method_args() | ||
|
|
||
|
|
@@ -1934,20 +1927,26 @@ def handle_failure( | |
| # only mark task instance as FAILED if the next task instance | ||
| # try_number exceeds the max_tries ... or if force_fail is truthy | ||
|
|
||
| task = None | ||
| try: | ||
| task = self.task.unmap() | ||
| except Exception: | ||
| self.log.error("Unable to unmap task, can't determine if we need to send an alert email or not") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we log the traceback here with |
||
|
|
||
| if force_fail or not self.is_eligible_to_retry(): | ||
| self.state = State.FAILED | ||
| email_for_state = task.email_on_failure | ||
| email_for_state = operator.attrgetter('email_on_failure') | ||
| else: | ||
| if self.state == State.QUEUED: | ||
| # We increase the try_number so as to fail the task if it fails to start after sometime | ||
| self._try_number += 1 | ||
| self.state = State.UP_FOR_RETRY | ||
| email_for_state = task.email_on_retry | ||
| email_for_state = operator.attrgetter('email_on_retry') | ||
|
|
||
| self._log_state('Immediate failure requested. ' if force_fail else '') | ||
| if email_for_state and task.email: | ||
| if task and email_for_state(task) and task.email: | ||
| try: | ||
| self.email_alert(error) | ||
| self.email_alert(error, task) | ||
| except Exception: | ||
| self.log.exception('Failed to send email to: %s', task.email) | ||
|
|
||
|
|
@@ -2241,11 +2240,15 @@ def render_k8s_pod_yaml(self) -> Optional[dict]: | |
| sanitized_pod = ApiClient().sanitize_for_serialization(pod) | ||
| return sanitized_pod | ||
|
|
||
| def get_email_subject_content(self, exception: BaseException) -> Tuple[str, str, str]: | ||
| def get_email_subject_content( | ||
| self, exception: BaseException, task: Optional["BaseOperator"] = None | ||
| ) -> Tuple[str, str, str]: | ||
| """Get the email subject content for exceptions.""" | ||
| # For a ti from DB (without ti.task), return the default value | ||
| # Reuse it for smart sensor to send default email alert | ||
| use_default = not hasattr(self, 'task') | ||
| if task is None: | ||
| task = getattr(self, 'task') | ||
| use_default = task is None | ||
| exception_html = str(exception).replace('\n', '<br>') | ||
|
|
||
| default_subject = 'Airflow alert: {{ti}}' | ||
|
|
@@ -2312,13 +2315,14 @@ def render(key: str, content: str) -> str: | |
|
|
||
| return subject, html_content, html_content_err | ||
|
|
||
| def email_alert(self, exception): | ||
| def email_alert(self, exception, task: "BaseOperator"): | ||
| """Send alert email with exception information.""" | ||
| subject, html_content, html_content_err = self.get_email_subject_content(exception) | ||
| subject, html_content, html_content_err = self.get_email_subject_content(exception, task=task) | ||
| assert task.email | ||
| try: | ||
| send_email(self.task.email, subject, html_content) | ||
| send_email(task.email, subject, html_content) | ||
| except Exception: | ||
| send_email(self.task.email, subject, html_content_err) | ||
| send_email(task.email, subject, html_content_err) | ||
|
|
||
| def set_duration(self) -> None: | ||
| """Set TI duration""" | ||
|
|
@@ -2573,9 +2577,10 @@ def __init__( | |
| dag_id: str, | ||
| task_id: str, | ||
| run_id: str, | ||
| start_date: datetime, | ||
| end_date: datetime, | ||
| start_date: Optional[datetime], | ||
| end_date: Optional[datetime], | ||
| try_number: int, | ||
| map_index: int, | ||
| state: str, | ||
| executor_config: Any, | ||
| pool: str, | ||
|
|
@@ -2584,21 +2589,20 @@ def __init__( | |
| run_as_user: Optional[str] = None, | ||
| priority_weight: Optional[int] = None, | ||
| ): | ||
| self._dag_id: str = dag_id | ||
| self._task_id: str = task_id | ||
| self._run_id: str = run_id | ||
| self._start_date: datetime = start_date | ||
| self._end_date: datetime = end_date | ||
| self._try_number: int = try_number | ||
| self._state: str = state | ||
| self._executor_config: Any = executor_config | ||
| self._run_as_user: Optional[str] = None | ||
| self._run_as_user = run_as_user | ||
| self._pool: str = pool | ||
| self._priority_weight: Optional[int] = None | ||
| self._priority_weight = priority_weight | ||
| self._queue: str = queue | ||
| self._key = key | ||
| self.dag_id = dag_id | ||
| self.task_id = task_id | ||
| self.run_id = run_id | ||
| self.map_index = map_index | ||
| self.start_date = start_date | ||
| self.end_date = end_date | ||
| self.try_number = try_number | ||
| self.state = state | ||
| self.executor_config = executor_config | ||
| self.run_as_user = run_as_user | ||
| self.pool = pool | ||
| self.priority_weight = priority_weight | ||
| self.queue = queue | ||
| self.key = key | ||
|
|
||
| def __eq__(self, other): | ||
| if isinstance(other, self.__class__): | ||
|
|
@@ -2611,6 +2615,7 @@ def from_ti(cls, ti: TaskInstance): | |
| dag_id=ti.dag_id, | ||
| task_id=ti.task_id, | ||
| run_id=ti.run_id, | ||
| map_index=ti.map_index, | ||
| start_date=ti.start_date, | ||
| end_date=ti.end_date, | ||
| try_number=ti.try_number, | ||
|
|
@@ -2625,80 +2630,16 @@ def from_ti(cls, ti: TaskInstance): | |
|
|
||
| @classmethod | ||
| def from_dict(cls, obj_dict: dict): | ||
| ti_key = obj_dict.get('_key', []) | ||
| start_date: Union[Any, datetime] = ( | ||
| datetime.fromisoformat(str(obj_dict.get('_start_date'))) | ||
| if obj_dict.get('_start_date') is not None | ||
| else None | ||
| ) | ||
| end_date: Union[Any, datetime] = ( | ||
| datetime.fromisoformat(str(obj_dict.get('_end_date'))) | ||
| if obj_dict.get('_end_date') is not None | ||
| else None | ||
| ) | ||
| return cls( | ||
| dag_id=str(obj_dict['_dag_id']), | ||
| task_id=str(obj_dict.get('_task_id')), | ||
| run_id=str(obj_dict.get('_run_id')), | ||
| start_date=start_date, | ||
| end_date=end_date, | ||
| try_number=obj_dict.get('_try_number', 1), | ||
| state=str(obj_dict.get('_state')), | ||
| executor_config=obj_dict.get('_executor_config'), | ||
| run_as_user=obj_dict.get('_run_as_user', None), | ||
| pool=str(obj_dict.get('_pool')), | ||
| priority_weight=obj_dict.get('_priority_weight', None), | ||
| queue=str(obj_dict.get('_queue')), | ||
| key=TaskInstanceKey(ti_key[0], ti_key[1], ti_key[2], ti_key[3], ti_key[4]), | ||
| ) | ||
|
|
||
| @property | ||
| def dag_id(self) -> str: | ||
| return self._dag_id | ||
|
|
||
| @property | ||
| def task_id(self) -> str: | ||
| return self._task_id | ||
|
|
||
| @property | ||
| def run_id(self) -> str: | ||
| return self._run_id | ||
|
|
||
| @property | ||
| def start_date(self) -> datetime: | ||
| return self._start_date | ||
|
|
||
| @property | ||
| def end_date(self) -> datetime: | ||
| return self._end_date | ||
|
|
||
| @property | ||
| def try_number(self) -> int: | ||
| return self._try_number | ||
|
|
||
| @property | ||
| def state(self) -> str: | ||
| return self._state | ||
|
|
||
| @property | ||
| def pool(self) -> str: | ||
| return self._pool | ||
|
|
||
| @property | ||
| def priority_weight(self) -> Optional[int]: | ||
| return self._priority_weight | ||
|
|
||
| @property | ||
| def queue(self) -> str: | ||
| return self._queue | ||
|
|
||
| @property | ||
| def key(self) -> TaskInstanceKey: | ||
| return self._key | ||
|
|
||
| @property | ||
| def executor_config(self): | ||
| return self._executor_config | ||
| ti_key = TaskInstanceKey(*obj_dict.pop('key')) | ||
| start_date = None | ||
| end_date = None | ||
| start_date_str: Optional[str] = obj_dict.pop('start_date') | ||
| end_date_str: Optional[str] = obj_dict.pop('end_date') | ||
| if start_date_str: | ||
| start_date = timezone.parse(start_date_str) | ||
| if end_date_str: | ||
| end_date = timezone.parse(end_date_str) | ||
| return cls(**obj_dict, start_date=start_date, end_date=end_date, key=ti_key) | ||
|
|
||
|
|
||
| STATICA_HACK = True | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we should revive #19242, it can make the TI <-> SimpleTI conversions more future-proof.