Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitize the conn_id to disallow potential script execution #32867

Merged
merged 33 commits into from Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
8b52ca7
sanitize the conn_id to disallow potential script execution
andylamp Jul 26, 2023
8da482d
Merge branch 'main' into main
andylamp Jul 26, 2023
6a4a01d
update docs and naming of the regex to better indicate its purpose
andylamp Jul 26, 2023
c0d2143
use re2 instead of re, as per static checks
andylamp Jul 26, 2023
2912fec
Update airflow/models/connection.py
andylamp Jul 27, 2023
a78c149
Merge branch 'main' into main
andylamp Jul 27, 2023
735b34b
refactor to return the first matched group, which is out desired outcome
andylamp Jul 27, 2023
44a7211
tweak the comments
andylamp Jul 27, 2023
54e7cd4
add a test suite for the regex sanitization
andylamp Jul 27, 2023
bfdcf86
fix the docs linter, change to en-US instead of en-GB
andylamp Jul 28, 2023
546afff
remove docstrings from test
andylamp Jul 28, 2023
96d72c1
Merge branch 'main' into main
andylamp Jul 31, 2023
a524a5e
update to fix integration tests
andylamp Aug 6, 2023
7882372
Merge branch 'main' into main
andylamp Aug 6, 2023
fa14397
update to remove logging that messes up tests
andylamp Aug 6, 2023
304836b
Merge remote-tracking branch 'origin/main'
andylamp Aug 6, 2023
9cf14cb
Merge branch 'main' into main
andylamp Aug 6, 2023
f9cd0b6
Merge branch 'main' into main
andylamp Aug 7, 2023
997e648
Update airflow/models/connection.py
andylamp Aug 7, 2023
b1ee2e1
remove commented code
andylamp Aug 8, 2023
c04caef
push regex update to handle aws ARN format
andylamp Aug 13, 2023
088f0de
make ruff happy
andylamp Aug 13, 2023
5491fe6
Merge branch 'main' into main
andylamp Aug 13, 2023
5408199
Merge remote-tracking branch 'origin/main'
andylamp Aug 13, 2023
6aa9612
add validator to form
andylamp Jan 12, 2024
0dc4134
Merge remote-tracking branch 'origin/main'
andylamp Jan 12, 2024
f8aa6d2
feat: add sanitization, proper messaging to FAB, fix errors/tests
andylamp Jan 15, 2024
68d9bdc
bugfix: typo in function docs
andylamp Jan 15, 2024
5ad35ca
bugfix: typo in function docs, add example for the regex
andylamp Jan 15, 2024
156f3fc
Merge branch 'main' into main
andylamp Jan 15, 2024
e1c126c
Merge branch 'main' into main
andylamp Jan 15, 2024
3feef94
Merge branch 'main' into main
andylamp Jan 15, 2024
90b3220
Merge branch 'main' into main
andylamp Jan 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
37 changes: 35 additions & 2 deletions airflow/models/connection.py
Expand Up @@ -24,6 +24,7 @@
from typing import Any
from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit

import re2
uranusjr marked this conversation as resolved.
Show resolved Hide resolved
from sqlalchemy import Boolean, Column, Integer, String, Text
from sqlalchemy.orm import declared_attr, reconstructor, synonym

Expand All @@ -38,6 +39,13 @@
from airflow.utils.module_loading import import_string

log = logging.getLogger(__name__)
# sanitize the `conn_id` pattern by allowing alphanumeric characters plus
# the symbols #,!,-,_,.,:,\,/ and () requiring at least one match.
#
# You can try the regex here: https://regex101.com/r/69033B/1
RE_SANITIZE_CONN_ID = re2.compile(r"^[\w\#\!\(\)\-\.\:\/\\]{1,}$")
# the conn ID max len should be 250
CONN_ID_MAX_LEN: int = 250


def parse_netloc_to_hostname(*args, **kwargs):
Expand All @@ -46,10 +54,35 @@ def parse_netloc_to_hostname(*args, **kwargs):
return _parse_netloc_to_hostname(*args, **kwargs)


def sanitize_conn_id(conn_id: str | None, max_length=CONN_ID_MAX_LEN) -> str | None:
r"""Sanitizes the connection id and allows only specific characters to be within.

Namely, it allows alphanumeric characters plus the symbols #,!,-,_,.,:,\,/ and () from 1 and up to
250 consecutive matches. If desired, the max length can be adjusted by setting `max_length`.

You can try to play with the regex here: https://regex101.com/r/69033B/1

The character selection is such that it prevents the injection of javascript or
executable bits to avoid any awkward behaviour in the front-end.

:param conn_id: The connection id to sanitize.
:param max_length: The max length of the connection ID, by default it is 250.
:return: the sanitized string, `None` otherwise.
andylamp marked this conversation as resolved.
Show resolved Hide resolved
andylamp marked this conversation as resolved.
Show resolved Hide resolved
"""
# check if `conn_id` or our match group is `None` and the `conn_id` is within the specified length.
if (not isinstance(conn_id, str) or len(conn_id) > max_length) or (
res := re2.match(RE_SANITIZE_CONN_ID, conn_id)
) is None:
return None

# if we reach here, then we matched something, return the first match
return res.group(0)


# Python automatically converts all letters to lowercase in hostname
# See: https://issues.apache.org/jira/browse/AIRFLOW-3615
def _parse_netloc_to_hostname(uri_parts):
"""Parse a URI string to get correct Hostname."""
"""Parse a URI string to get the correct Hostname."""
hostname = unquote(uri_parts.hostname or "")
if "/" in hostname:
hostname = uri_parts.netloc
Expand Down Expand Up @@ -115,7 +148,7 @@ def __init__(
uri: str | None = None,
):
super().__init__()
self.conn_id = conn_id
self.conn_id = sanitize_conn_id(conn_id)
self.description = description
if extra and not isinstance(extra, str):
extra = json.dumps(extra)
Expand Down
4 changes: 2 additions & 2 deletions airflow/www/forms.py
Expand Up @@ -41,7 +41,7 @@
from airflow.providers_manager import ProvidersManager
from airflow.utils import timezone
from airflow.utils.types import DagRunType
from airflow.www.validators import ReadOnly, ValidKey
from airflow.www.validators import ReadOnly, ValidConnID
from airflow.www.widgets import (
AirflowDateTimePickerROWidget,
AirflowDateTimePickerWidget,
Expand Down Expand Up @@ -221,7 +221,7 @@ def process(self, formdata=None, obj=None, **kwargs):

conn_id = StringField(
lazy_gettext("Connection Id"),
validators=[InputRequired(), ValidKey()],
validators=[InputRequired(), ValidConnID()],
widget=BS3TextFieldWidget(),
)
conn_type = SelectField(
Expand Down
28 changes: 27 additions & 1 deletion airflow/www/validators.py
Expand Up @@ -22,6 +22,7 @@

from wtforms.validators import EqualTo, ValidationError

from airflow.models.connection import CONN_ID_MAX_LEN, sanitize_conn_id
from airflow.utils import helpers


Expand Down Expand Up @@ -85,7 +86,7 @@ class ValidKey:
Validates values that will be used as keys.

:param max_length:
The maximum length of the given key
The maximum allowed length of the given key
"""

def __init__(self, max_length=200):
Expand All @@ -108,3 +109,28 @@ class ReadOnly:

def __call__(self, form, field):
field.flags.readonly = True


class ValidConnID:
"""
Validates the connection ID adheres to the desired format.

:param max_length:
The maximum allowed length of the given Connection ID.
"""

message = (
"Connection ID must be alphanumeric characters plus dashes, dots, hashes, colons, semicolons, "
"underscores, exclamation marks, and parentheses"
)

def __init__(
self,
max_length: int = CONN_ID_MAX_LEN,
):
self.max_length = max_length

def __call__(self, form, field):
if field.data:
if sanitize_conn_id(field.data, self.max_length) is None:
raise ValidationError(f"{self.message} for 1 and up to {self.max_length} matches")
64 changes: 64 additions & 0 deletions tests/models/test_connection.py
Expand Up @@ -186,3 +186,67 @@ def test_parse_from_uri(
)
def test_get_uri(self, connection, expected_uri):
assert connection.get_uri() == expected_uri

@pytest.mark.parametrize(
"connection, expected_conn_id",
[
# a valid example of connection id
(
Connection(
conn_id="12312312312213___12312321",
conn_type="type",
login="user",
password="pass",
host="host",
port=100,
schema="schema",
extra={"param1": "val1", "param2": "val2"},
),
"12312312312213___12312321",
),
# an invalid example of connection id, which allows potential code execution
(
Connection(
conn_id="<script>alert(1)</script>",
conn_type="type",
host="protocol://host",
port=100,
schema="schema",
extra={"param1": "val1", "param2": "val2"},
),
None,
),
# a valid connection as well
(
Connection(
conn_id="a_valid_conn_id_!!##",
conn_type="type",
login="user",
password="pass",
host="protocol://host",
port=100,
schema="schema",
extra={"param1": "val1", "param2": "val2"},
),
"a_valid_conn_id_!!##",
),
# a valid connection as well testing dashes
(
Connection(
conn_id="a_-.11",
conn_type="type",
login="user",
password="pass",
host="protocol://host",
port=100,
schema="schema",
extra={"param1": "val1", "param2": "val2"},
),
"a_-.11",
),
],
)
# Responsible for ensuring that the sanitized connection id
# string works as expected.
def test_sanitize_conn_id(self, connection, expected_conn_id):
assert connection.conn_id == expected_conn_id