Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve validation in HTTP parser #8074

Merged
merged 24 commits into from Jan 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
dc567ba
http_parser: test edge cases
pajod Dec 18, 2023
00cb2a2
http_parser: enforce HTTP version & header sep
pajod Dec 18, 2023
3b7e9eb
style: remove backslash from METHRE
pajod Dec 18, 2023
e0dde7c
http_parser: test edge cases (v2)
pajod Dec 18, 2023
e2b0483
use identical re.Pattern to validate method/header
pajod Dec 18, 2023
b38306b
FIXME: rename CHANGES/*.bugfix before merging
pajod Dec 18, 2023
ea85a3c
FIXME: rename CHANGES/*.bugfix before merging
pajod Dec 18, 2023
f0314ca
http_parser: test edge cases (v3)
pajod Dec 19, 2023
70cd60e
𝟭/𝟏/𝟷/1/𝟣/1/۱/𝟙
pajod Dec 19, 2023
4878638
typo, whitespace
pajod Dec 19, 2023
372fe5b
Update aiohttp/http_parser.py
Dreamsorcerer Jan 26, 2024
43c127a
Update http_parser.py
Dreamsorcerer Jan 26, 2024
4e8306c
xfail in tests: just expect no exception, these will remain unchanged
pajod Jan 26, 2024
9234224
example: additional unicode confusion
pajod Jan 26, 2024
39549e5
FIXME: rename CHANGES/*.bugfix before merging
pajod Jan 26, 2024
ddb4bc2
CI: 3.8 compat typings, silence but not un-parametrize regression test
pajod Jan 26, 2024
dd4bdc4
CI: un-parametrize the other broken request, too
pajod Jan 26, 2024
c38e243
style: name plus-suffixed patterns with plural S
pajod Jan 26, 2024
8cc25ff
style: must not name the one and only encoding
pajod Jan 26, 2024
75b4489
CI: not a spelling mistake
pajod Jan 26, 2024
d1b746f
FIXME: rename CHANGES/*.bugfix before merging
pajod Jan 26, 2024
480fc51
FIXME: rename CHANGES/*.bugfix before merging
pajod Jan 26, 2024
5ecbf79
CI: xfail the empty case for C parser
pajod Jan 26, 2024
6b82936
Rename 0.bugfix to 8074.bugfix.rst
Dreamsorcerer Jan 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES/8074.bugfix.rst
@@ -0,0 +1,5 @@
Fixed an unhandled exception in the Python HTTP parser on header lines starting with a colon -- by :user:`pajod`.

Invalid request lines with anything but a dot between the HTTP major and minor version are now rejected. Invalid header field names containing question mark or slash are now rejected. Such requests are incompatible with :rfc:`9110#section-5.6.2` and are not known to be of any legitimate use.

(BACKWARD INCOMPATIBLE)
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Expand Up @@ -262,6 +262,7 @@ Pankaj Pandey
Parag Jain
Pau Freixes
Paul Colomiets
Paul J. Dorn
Paulius Šileikis
Paulus Schoutsen
Pavel Kamaev
Expand Down
32 changes: 18 additions & 14 deletions aiohttp/http_parser.py
Expand Up @@ -69,12 +69,11 @@
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
# "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
# token = 1*tchar
METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
HDRRE: Final[Pattern[bytes]] = re.compile(
rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]"
)
HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+")


class RawRequestMessage(NamedTuple):
Expand Down Expand Up @@ -133,6 +132,7 @@
self, lines: List[bytes]
) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
headers: CIMultiDict[str] = CIMultiDict()
# note: "raw" does not mean inclusion of OWS before/after the field value
raw_headers = []

lines_idx = 1
Expand All @@ -146,13 +146,14 @@
except ValueError:
raise InvalidHeader(line) from None

if len(bname) == 0:
raise InvalidHeader(bname)

Check warning on line 150 in aiohttp/http_parser.py

View check run for this annotation

Codecov / codecov/patch

aiohttp/http_parser.py#L150

Added line #L150 was not covered by tests

# https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"}
raise InvalidHeader(line)

bvalue = bvalue.lstrip(b" \t")
if HDRRE.search(bname):
raise InvalidHeader(bname)
if len(bname) > self.max_field_size:
raise LineTooLong(
"request header name {}".format(
Expand All @@ -161,6 +162,9 @@
str(self.max_field_size),
str(len(bname)),
)
name = bname.decode("utf-8", "surrogateescape")
if not TOKENRE.fullmatch(name):
raise InvalidHeader(bname)

header_length = len(bvalue)

Expand Down Expand Up @@ -207,7 +211,6 @@
)

bvalue = bvalue.strip(b" \t")
name = bname.decode("utf-8", "surrogateescape")
value = bvalue.decode("utf-8", "surrogateescape")

# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
Expand Down Expand Up @@ -331,7 +334,8 @@

# Shouldn't allow +/- or other number formats.
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
if not length_hdr.strip(" \t").isdecimal():
# msg.headers is already stripped of leading/trailing wsp
if not DIGITS.fullmatch(length_hdr):
raise InvalidHeader(CONTENT_LENGTH)

return int(length_hdr)
Expand Down Expand Up @@ -559,7 +563,7 @@
)

# method
if not METHRE.fullmatch(method):
if not TOKENRE.fullmatch(method):
raise BadStatusLine(method)

# version
Expand Down Expand Up @@ -676,8 +680,8 @@
raise BadStatusLine(line)
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))

# The status code is a three-digit number
if len(status) != 3 or not status.isdecimal():
# The status code is a three-digit ASCII number, no padding
if len(status) != 3 or not DIGITS.fullmatch(status):
raise BadStatusLine(line)
status_i = int(status)

Expand Down Expand Up @@ -818,7 +822,7 @@
if self._lax: # Allow whitespace in lax mode.
size_b = size_b.strip()

if not re.fullmatch(HEXDIGIT, size_b):
if not re.fullmatch(HEXDIGITS, size_b):
exc = TransferEncodingError(
chunk[:pos].decode("ascii", "surrogateescape")
)
Expand Down
143 changes: 140 additions & 3 deletions tests/test_http_parser.py
Expand Up @@ -3,7 +3,8 @@

import asyncio
import re
from typing import Any, List
from contextlib import nullcontext
from typing import Any, Dict, List
from unittest import mock
from urllib.parse import quote

Expand Down Expand Up @@ -168,11 +169,27 @@
parser.feed_data(text)


@pytest.mark.parametrize(
"rfc9110_5_6_2_token_delim",
r'"(),/:;<=>?@[\]{}',
)
def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None:
    """Each RFC 9110 token delimiter inside a header field name is rejected."""
    delim = rfc9110_5_6_2_token_delim
    text = f"POST / HTTP/1.1\r\nhead{delim}er: val\r\n\r\n".encode()
    if delim == ":":
        # Inserting colon into header just splits name/value earlier.
        parser.feed_data(text)
    else:
        with pytest.raises(http_exceptions.BadHttpMessage):
            parser.feed_data(text)


@pytest.mark.parametrize(
"hdr",
(
"Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
"Content-Length: +256",
"Content-Length: \N{superscript one}",
"Content-Length: \N{mathematical double-struck digit one}",
"Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
"Bar: abc\ndef",
"Baz: abc\x00def",
Expand Down Expand Up @@ -265,6 +282,20 @@
parser.feed_data(text)


def test_parse_unusual_request_line(parser: Any) -> None:
    """Py parser accepts unusual-but-valid token methods and future versions."""
    # Check the parser under test (request parser), not the response fixture:
    # ``isinstance(response, HttpResponseParserPy)`` was a copy-paste mistake
    # (acknowledged in review) that made this test xfail unconditionally.
    if not isinstance(parser, HttpRequestParserPy):
        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
    text = b"#smol //a HTTP/1.3\r\n\r\n"
    messages, upgrade, tail = parser.feed_data(text)
    assert len(messages) == 1
    msg, _ = messages[0]
    assert msg.compression is None
    assert not msg.upgrade
    assert msg.method == "#smol"
    assert msg.path == "//a"
    assert msg.version == (1, 3)

Check warning on line 296 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L288-L296

Added lines #L288 - L296 were not covered by tests


def test_parse(parser: Any) -> None:
text = b"GET /test HTTP/1.1\r\n\r\n"
messages, upgrade, tail = parser.feed_data(text)
Expand Down Expand Up @@ -567,6 +598,45 @@
parser.feed_data(text)


# Padding byte sequences placed before/after a header field name to exercise
# the "no whitespace around field names" rule; values are human-readable
# pytest parameter ids.
_pad: Dict[bytes, str] = {
    b"": "empty",
    # not a typo. Python likes triple zero
    b"\000": "NUL",
    b" ": "SP",
    b" ": "SPSP",
    # not a typo: both 0xa0 and 0x0a in case of 8-bit fun
    b"\n": "LF",
    b"\xa0": "NBSP",
    b"\t ": "TABSP",
}


@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
def test_invalid_header_spacing(
    parser: Any, pad1: bytes, pad2: bytes, hdr: bytes
) -> None:
    """Whitespace around a header field name must be rejected.

    Only the unpadded, non-empty name in the parameter matrix is valid.
    """
    text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
    expectation = pytest.raises(http_exceptions.BadHttpMessage)
    if pad1 == pad2 == b"" and hdr != b"":
        # one entry in param matrix is correct: non-empty name, not padded
        expectation = nullcontext()
    if pad1 == pad2 == hdr == b"":
        # Check the parser under test (request parser), not the response
        # fixture: ``isinstance(response, ...)`` was a copy-paste mistake
        # (acknowledged in review) that xfailed for the wrong object.
        if not isinstance(parser, HttpRequestParserPy):
            pytest.xfail("Regression test for Py parser. May match C behaviour later.")
    with expectation:
        parser.feed_data(text)


def test_empty_header_name(parser: Any) -> None:
    """A header line starting with a colon (empty field name) is rejected."""
    # Check the parser under test (request parser), not the response fixture:
    # ``isinstance(response, HttpResponseParserPy)`` was a copy-paste mistake
    # (acknowledged in review) that made this test xfail unconditionally.
    if not isinstance(parser, HttpRequestParserPy):
        pytest.xfail("Regression test for Py parser. May match C behaviour later.")
    text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"
    with pytest.raises(http_exceptions.BadHttpMessage):
        parser.feed_data(text)

Check warning on line 637 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L637

Added line #L637 was not covered by tests


def test_invalid_header(parser: Any) -> None:
text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
with pytest.raises(http_exceptions.BadHttpMessage):
Expand Down Expand Up @@ -689,6 +759,34 @@
assert r"\n" not in exc_info.value.message


# Non-ASCII "digit" byte sequences that must NOT be accepted in numeric
# protocol fields (HTTP version digit, status code); values are pytest ids.
_num: Dict[bytes, str] = {
    # dangerous: accepted by Python int()
    # unicodedata.category("\U0001D7D9") == 'Nd'
    "\N{mathematical double-struck digit one}".encode(): "utf8digit",
    # only added for interop tests, refused by Python int()
    # unicodedata.category("\U000000B9") == 'No'
    "\N{superscript one}".encode(): "utf8number",
    "\N{superscript one}".encode("latin-1"): "latin1number",
}


@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
def test_http_request_bad_status_line_number(
    parser: Any, nonascii_digit: bytes
) -> None:
    """A non-ASCII digit in the HTTP version must raise BadStatusLine."""
    request = b"".join([b"GET /digit HTTP/1.", nonascii_digit, b"\r\n\r\n"])
    with pytest.raises(http_exceptions.BadStatusLine):
        parser.feed_data(request)


def test_http_request_bad_status_line_separator(parser: Any) -> None:
# single code point, old, multibyte NFKC, multibyte NFKD
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @pajod, could you expand what this comment refers to? Does the utf8sep variable contain a value that matches all the listed cases? I'm rather confused. Or did you mean to test different cases but added just one?

Copy link
Contributor

@pajod pajod Jan 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That utf8sep has those properties (and LTR), a popular choice for being multiple edge cases in one. None of which are strictly needed for.. comparing to the literal ASCII dot as we do here, but some of which I expect to regain relevance on future refactoring.

utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
with pytest.raises(http_exceptions.BadStatusLine):
parser.feed_data(text)


def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
with pytest.raises(http_exceptions.BadStatusLine):
Expand All @@ -710,6 +808,31 @@
assert tail == b"some raw data"


def test_http_request_parser_utf8_request_line(parser: Any) -> None:
if not isinstance(response, HttpResponseParserPy):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pajod FYI it's much cleaner to use the @pytest.mark.xfail decorator since it allows pytest to make the decisions earlier in the process.

Copy link
Contributor

@pajod pajod Jan 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But that only explains why a llhttp-only code coverage report complains!? There is something much more wrong here. Like, I completely broke that test levels of wrong.

Edit: Sorry, I did. Four times. I meant to acknowledge (in two cases, expected to be changed) behaviour differences of the C parser, while keeping my tests parametrized to keep running both parsers anyway. But each time I copied the not isinstance(response, HttpResponseParserPy) line where it should say not isinstance(parser, HttpRequestParserPy). And mypy would have told me, if not for that Any and the duplicate use of the response identifier (global scope function but also function scope variable)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I noticed there was no coverage on these tests for some reason. Something to look at later.

Copy link
Member

@webknjaz webknjaz Jan 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. I started looking into the coverage drop and that's how I noticed this thing.

FWIW we really should refactor how the tests with and without extensions are parametrized/generated, like I did in multidict recently. Essentially, there was an import loop in some places in tests that prevented C-extension tests from being executed (aio-libs/multidict#837 / aio-libs/multidict#915 / https://multidict.aio-libs.org/en/latest/changes/#contributor-facing-changes). So I fixed that by having an explicit global option for requiring one mode or the other, with a collection of fixtures reused everywhere and zero magic around import attempts and handling failures in weird ways.

Another thing I configured is the module classification in Codecov with different expected coverage thresholds — the goal should be that tests get 100% coverage in every CI run (from all jobs combined, of course). And the actual project code coverage should be measured as a separate metric. Currently, a global threshold value allows coverage to drop in tests if it's compensated by coverage in the project, meaning that we may be gaining more dead code (read: tests that are never executed), which results in a false sense of things being tested, when they aren't.

pytest.xfail("Regression test for Py parser. May match C behaviour later.")
messages, upgrade, tail = parser.feed_data(

Check warning on line 814 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L814

Added line #L814 was not covered by tests
# note the truncated unicode sequence
b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
# for easier grep: ASCII 0xA0 more commonly known as non-breaking space
# note the leading and trailing spaces
"sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} "
"\r\n\r\n".encode()
)
msg = messages[0][0]

Check warning on line 822 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L822

Added line #L822 was not covered by tests

assert msg.method == "GET"
assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
assert msg.version == (1, 1)
assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
assert not msg.should_close
assert msg.compression is None
assert not msg.upgrade
assert not msg.chunked
assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path

Check warning on line 833 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L824-L833

Added lines #L824 - L833 were not covered by tests


def test_http_request_parser_utf8(parser: Any) -> None:
text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
messages, upgrade, tail = parser.feed_data(text)
Expand Down Expand Up @@ -759,9 +882,15 @@
assert not msg.chunked


def test_http_request_parser_bad_method(parser: Any) -> None:
@pytest.mark.parametrize(
    "rfc9110_5_6_2_token_delim",
    [bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
)
def test_http_request_parser_bad_method(
    parser: Any, rfc9110_5_6_2_token_delim: bytes
) -> None:
    """A token delimiter at the start of the method must be rejected."""
    bad_request = rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n'
    with pytest.raises(http_exceptions.BadStatusLine):
        parser.feed_data(bad_request)


def test_http_request_parser_bad_version(parser: Any) -> None:
Expand Down Expand Up @@ -979,6 +1108,14 @@
response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")


@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
def test_http_response_parser_code_not_ascii(
    response: Any, nonascii_digit: bytes
) -> None:
    """A non-ASCII digit inside the status code must raise BadStatusLine."""
    status_line = b"".join([b"HTTP/1.1 20", nonascii_digit, b" test\r\n\r\n"])
    with pytest.raises(http_exceptions.BadStatusLine):
        response.feed_data(status_line)


def test_http_request_chunked_payload(parser: Any) -> None:
text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
msg, payload = parser.feed_data(text)[0][0]
Expand Down