Skip to content

Commit

Permalink
Fix unicode errors (#7044)
Browse files Browse the repository at this point in the history
## What do these changes do?

Fix a few errors related to Unicode decoding.
* Multipart forms whose data contains invalid UTF-8 bytes can cause a
UnicodeDecodeError to be raised. This should be raised as a ValueError
instead, as is done in other parts of the code base.
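* The HTTP request parser (pure-Python) tries to decode the header name with
utf-8/xmlcharrefreplace, but `xmlcharrefreplace` is an encoding-only error
handler, so it cannot handle undecodable bytes such as `\xd9`. Use
`backslashreplace` instead, which is also supported when decoding.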

## Are there changes in behavior for the user?

I don't think so.

## Related issue number

<!-- Are there any issues opened that will be resolved by merging this
change? -->

## Checklist

- [x] I think the code is well written
- [x] Unit tests for the changes exist
- [ ] ~Documentation reflects the changes~
- [ ] If you provide code modification, please add yourself to
`CONTRIBUTORS.txt`
  * The format is `<Name> <Surname>`.
  * Please keep alphabetical order, the file is sorted by names.
- [x] ~Add a new news fragment into the `CHANGES` folder~
  * name it `<issue_id>.<type>` for example (588.bugfix)
  * If you don't have an `issue_id`, change it to the PR id after creating
    the PR.
  * ensure type is one of the following:
    * `.feature`: Signifying a new feature.
    * `.bugfix`: Signifying a bug fix.
    * `.doc`: Signifying a documentation improvement.
    * `.removal`: Signifying a deprecation or removal of public API.
    * `.misc`: A ticket has been closed, but it is not of interest to users.
* Make sure to use full sentences with correct case and punctuation, for
example: "Fix issue with non-ascii contents in doctest text files."
  • Loading branch information
ret2libc committed Nov 20, 2022
1 parent c0a7666 commit bce6e3c
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGES/7044.bugfix
@@ -0,0 +1 @@
Avoid raising UnicodeDecodeError in multipart and in HTTP headers parsing.
6 changes: 3 additions & 3 deletions aiohttp/http_parser.py
Expand Up @@ -154,7 +154,7 @@ def parse_headers(
if len(bname) > self.max_field_size:
raise LineTooLong(
"request header name {}".format(
bname.decode("utf8", "xmlcharrefreplace")
bname.decode("utf8", "backslashreplace")
),
str(self.max_field_size),
str(len(bname)),
Expand All @@ -176,7 +176,7 @@ def parse_headers(
if header_length > self.max_field_size:
raise LineTooLong(
"request header field {}".format(
bname.decode("utf8", "xmlcharrefreplace")
bname.decode("utf8", "backslashreplace")
),
str(self.max_field_size),
str(header_length),
Expand All @@ -197,7 +197,7 @@ def parse_headers(
if header_length > self.max_field_size:
raise LineTooLong(
"request header field {}".format(
bname.decode("utf8", "xmlcharrefreplace")
bname.decode("utf8", "backslashreplace")
),
str(self.max_field_size),
str(header_length),
Expand Down
7 changes: 6 additions & 1 deletion aiohttp/multipart.py
Expand Up @@ -462,8 +462,13 @@ async def form(self, *, encoding: Optional[str] = None) -> List[Tuple[str, str]]
real_encoding = encoding
else:
real_encoding = self.get_charset(default="utf-8")
try:
decoded_data = data.rstrip().decode(real_encoding)
except UnicodeDecodeError:
raise ValueError("data cannot be decoded with %s encoding" % real_encoding)

return parse_qsl(
data.rstrip().decode(real_encoding),
decoded_data,
keep_blank_values=True,
encoding=real_encoding,
)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_http_parser.py
Expand Up @@ -117,6 +117,14 @@ def test_parse_headers(parser: Any) -> None:
assert not msg.upgrade


def test_parse_headers_longline(parser: Any) -> None:
    """Feed a request whose long header name contains a non-UTF-8 byte.

    The header name is ``Test`` + ``\\xd9`` + ``Header`` followed by 8192
    ``A`` bytes.  The parser is expected to reject the request with either
    ``LineTooLong`` or ``BadHttpMessage``.
    """
    bad_byte = b"\xd9"
    oversized_name = b"".join((b"Test", bad_byte, b"Header", b"A" * 8192))
    request = b"".join(
        (
            b"GET /test HTTP/1.1\r\n",
            oversized_name,
            b": test\r\n",
            b"\r\n",
            b"\r\n",
        )
    )
    expected_errors = (http_exceptions.LineTooLong, http_exceptions.BadHttpMessage)
    with pytest.raises(expected_errors):
        parser.feed_data(request)


def test_parse(parser: Any) -> None:
text = b"GET /test HTTP/1.1\r\n\r\n"
messages, upgrade, tail = parser.feed_data(text)
Expand Down
15 changes: 15 additions & 0 deletions tests/test_multipart.py
Expand Up @@ -599,6 +599,21 @@ async def test_read_form(self, newline: Any) -> None:
result = await obj.form()
assert [("foo", "bar"), ("foo", "baz"), ("boo", "")] == result

async def test_read_form_invalid_utf8(self, newline: Any) -> None:
    """Check that ``form()`` raises ``ValueError`` for a body starting with ``\\xff``.

    The part is declared as ``application/x-www-form-urlencoded`` and carries
    no explicit charset, and the expected error message names the ``utf-8``
    encoding.
    """
    payload = b"\xff" + b"%s--:--" % newline
    part_headers = {CONTENT_TYPE: "application/x-www-form-urlencoded"}
    with Stream(payload) as stream:
        reader = aiohttp.BodyPartReader(
            BOUNDARY,
            part_headers,
            stream,
            _newline=newline,
        )
        expected_message = "data cannot be decoded with utf-8 encoding"
        with pytest.raises(ValueError, match=expected_message):
            await reader.form()

async def test_read_form_encoding(self, newline: Any) -> None:
data = b"foo=bar&foo=baz&boo=%s--:--" % newline
with Stream(data) as stream:
Expand Down

0 comments on commit bce6e3c

Please sign in to comment.