From 0e6fd420cde48d755b2a032d33405ef3917a30db Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 16 Apr 2026 05:15:09 +0200 Subject: [PATCH 1/2] Strip userinfo from ES host URL before using it as task-log label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Elasticsearch task-log handler grouped hits by host, falling back to the raw ``[elasticsearch] host`` config value when a hit lacked a ``host`` field. That config commonly embeds credentials (``https://user:password@elk.example.com:9200``), so the full URL — including the ``user:password@`` userinfo — would appear as a dictionary key in the task-log output, where any user with task-log read permission could see it. Add a ``_strip_userinfo`` helper and use it for the host fallback in ``_group_logs_by_host``. The Elasticsearch client is still connected using the full unredacted URL, so authentication is unaffected. Generated-by: Claude Opus 4.6 (1M context) following the guidelines at https://github.com/apache/airflow/blob/main/contributing-docs/05_pull_requests.rst#gen-ai-assisted-contributions --- providers/elasticsearch/docs/changelog.rst | 11 ++++++++ .../elasticsearch/log/es_task_handler.py | 25 ++++++++++++++++++- .../elasticsearch/log/test_es_task_handler.py | 16 ++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/providers/elasticsearch/docs/changelog.rst b/providers/elasticsearch/docs/changelog.rst index 2742767eb333c..7a629fe2029a1 100644 --- a/providers/elasticsearch/docs/changelog.rst +++ b/providers/elasticsearch/docs/changelog.rst @@ -27,6 +27,17 @@ Changelog --------- +Bug fixes +~~~~~~~~~ + +When the ``[elasticsearch] host`` config embeds credentials +(``https://user:password@elk.example.com:9200``), the log-source label +shown in task logs is now the host URL with the ``user:password@`` portion +stripped. Previously the full URL (including credentials) could appear as +a dictionary key in the task-log output when log-hits did not carry a +``host`` field. The Elasticsearch client is still connected using the +full URL, so authentication is unaffected. + 6.5.2 ..... diff --git a/providers/elasticsearch/src/airflow/providers/elasticsearch/log/es_task_handler.py b/providers/elasticsearch/src/airflow/providers/elasticsearch/log/es_task_handler.py index 3296f000f51ea..a8c676a8699c2 100644 --- a/providers/elasticsearch/src/airflow/providers/elasticsearch/log/es_task_handler.py +++ b/providers/elasticsearch/src/airflow/providers/elasticsearch/log/es_task_handler.py @@ -166,6 +166,28 @@ def getattr_nested(obj, item, default): return default +def _strip_userinfo(url: str) -> str: + """ + Return ``url`` with any ``user:password@`` userinfo removed. + + The Elasticsearch ``[elasticsearch] host`` config commonly embeds + credentials (``https://user:password@elk.example.com:9200``). This + value is reused as a display label for log-source grouping, so the + credentials would otherwise end up in task logs. Anything that is + not a valid URL is returned unchanged. + """ + try: + parsed = urlparse(url) + except (TypeError, ValueError): + return url + if not parsed.hostname or (not parsed.username and not parsed.password): + return url + netloc = parsed.hostname + if parsed.port is not None: + netloc = f"{netloc}:{parsed.port}" + return parsed._replace(netloc=netloc).geturl() + + def _render_log_id(log_id_template: str, ti: TaskInstance | TaskInstanceKey, try_number: int) -> str: return log_id_template.format( dag_id=ti.dag_id, @@ -801,8 +823,9 @@ def _get_index_patterns(self, ti: RuntimeTI | None) -> str: def _group_logs_by_host(self, response: ElasticSearchResponse) -> dict[str, list[Hit]]: grouped_logs = defaultdict(list) + host_fallback = _strip_userinfo(self.host) for hit in response: - key = getattr_nested(hit, self.host_field, None) or self.host + key = getattr_nested(hit, self.host_field, None) or host_fallback grouped_logs[key].append(hit) return grouped_logs diff --git a/providers/elasticsearch/tests/unit/elasticsearch/log/test_es_task_handler.py b/providers/elasticsearch/tests/unit/elasticsearch/log/test_es_task_handler.py index 0ab4a8e2ed4cf..90ed1773b3546 100644 --- a/providers/elasticsearch/tests/unit/elasticsearch/log/test_es_task_handler.py +++ b/providers/elasticsearch/tests/unit/elasticsearch/log/test_es_task_handler.py @@ -43,6 +43,7 @@ _clean_date, _format_error_detail, _render_log_id, + _strip_userinfo, get_es_kwargs_from_config, getattr_nested, ) @@ -228,6 +229,21 @@ def test_format_url(self, host, expected): else: assert ElasticsearchTaskHandler.format_url(host) == expected + @pytest.mark.parametrize( + ("host", "expected"), + [ + ("https://user:pass@elk.example.com:9200", "https://elk.example.com:9200"), + ("http://USER:PASS@elk.example.com", "http://elk.example.com"), + ("https://elk.example.com:9200", "https://elk.example.com:9200"), + ("http://localhost:9200", "http://localhost:9200"), + ("https://user@elk.example.com", "https://elk.example.com"), + ("not-a-url", "not-a-url"), + ("", ""), + ], + ) + def test_strip_userinfo(self, host, expected): + assert _strip_userinfo(host) == expected + def test_client(self): assert isinstance(self.es_task_handler.client, elasticsearch.Elasticsearch) assert self.es_task_handler.index_patterns == "_all" From 7962e234676d90f65b45576739e9e23f3900f1b0 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 16 Apr 2026 05:17:02 +0200 Subject: [PATCH 2/2] Apply suggestion from @potiuk --- providers/elasticsearch/docs/changelog.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/providers/elasticsearch/docs/changelog.rst b/providers/elasticsearch/docs/changelog.rst index 7a629fe2029a1..3adf13e4f175d 100644 --- a/providers/elasticsearch/docs/changelog.rst +++ b/providers/elasticsearch/docs/changelog.rst @@ -27,9 +27,6 @@ Changelog --------- -Bug fixes -~~~~~~~~~ - When the ``[elasticsearch] host`` config embeds credentials (``https://user:password@elk.example.com:9200``), the log-source label shown in task logs is now the host URL with the ``user:password@`` portion