Fix unicode errors (#7044)

## What do these changes do? Fix a few errors related to Unicode decoding. * Multipart forms with invalid UTF-8 bytes as data can cause a UnicodeDecodeError to be raised. This should be raised as ValueError, as is done in other parts of the code base. * The HTTP request parser (pure-Python) tries to decode the header name with utf-8/xmlcharrefreplace; `xmlcharrefreplace` is an encode-only error handler, so it cannot handle undecodable bytes such as `\xd9`. ## Are there changes in behavior for the user? I don't think so. ## Related issue number  ## Checklist - [x] I think the code is well written - [x] Unit tests for the changes exist - [ ] ~Documentation reflects the changes~ - [ ] If you provide code modification, please add yourself to `CONTRIBUTORS.txt` * The format is <Name> <Surname>. * Please keep alphabetical order, the file is sorted by names. - [x] ~Add a new news fragment into the `CHANGES` folder~ * name it `<issue_id>.<type>` for example (588.bugfix) * if you don't have an `issue_id` change it to the pr id after creating the pr * ensure type is one of the following: * `.feature`: Signifying a new feature. * `.bugfix`: Signifying a bug fix. * `.doc`: Signifying a documentation improvement. * `.removal`: Signifying a deprecation or removal of public API. * `.misc`: A ticket has been closed, but it is not of interest to users. * Make sure to use full sentences with correct case and punctuation, for example: "Fix issue with non-ascii contents in doctest text files."
aio-libs · Nov 20, 2022 · bce6e3c · bce6e3c
1 parent c0a7666
commit bce6e3c
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 4 deletions.
diff --git a/CHANGES/7044.bugfix b/CHANGES/7044.bugfix
@@ -0,0 +1 @@
+Avoid raising UnicodeDecodeError in multipart and in HTTP headers parsing.
diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py
@@ -154,7 +154,7 @@ def parse_headers(
             if len(bname) > self.max_field_size:
                 raise LineTooLong(
                     "request header name {}".format(
-                        bname.decode("utf8", "xmlcharrefreplace")
+                        bname.decode("utf8", "backslashreplace")
                     ),
                     str(self.max_field_size),
                     str(len(bname)),
@@ -176,7 +176,7 @@ def parse_headers(
                     if header_length > self.max_field_size:
                         raise LineTooLong(
                             "request header field {}".format(
-                                bname.decode("utf8", "xmlcharrefreplace")
+                                bname.decode("utf8", "backslashreplace")
                             ),
                             str(self.max_field_size),
                             str(header_length),
@@ -197,7 +197,7 @@ def parse_headers(
                 if header_length > self.max_field_size:
                     raise LineTooLong(
                         "request header field {}".format(
-                            bname.decode("utf8", "xmlcharrefreplace")
+                            bname.decode("utf8", "backslashreplace")
                         ),
                         str(self.max_field_size),
                         str(header_length),

diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py
@@ -462,8 +462,13 @@ async def form(self, *, encoding: Optional[str] = None) -> List[Tuple[str, str]]
             real_encoding = encoding
         else:
             real_encoding = self.get_charset(default="utf-8")
+        try:
+            decoded_data = data.rstrip().decode(real_encoding)
+        except UnicodeDecodeError:
+            raise ValueError("data cannot be decoded with %s encoding" % real_encoding)
+
         return parse_qsl(
-            data.rstrip().decode(real_encoding),
+            decoded_data,
             keep_blank_values=True,
             encoding=real_encoding,
         )

diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py
@@ -117,6 +117,14 @@ def test_parse_headers(parser: Any) -> None:
     assert not msg.upgrade
 
 
+def test_parse_headers_longline(parser: Any) -> None:
+    invalid_unicode_byte = b"\xd9"
+    header_name = b"Test" + invalid_unicode_byte + b"Header" + b"A" * 8192
+    text = b"GET /test HTTP/1.1\r\n" + header_name + b": test\r\n" + b"\r\n" + b"\r\n"
+    with pytest.raises((http_exceptions.LineTooLong, http_exceptions.BadHttpMessage)):
+        parser.feed_data(text)
+
+
 def test_parse(parser: Any) -> None:
     text = b"GET /test HTTP/1.1\r\n\r\n"
     messages, upgrade, tail = parser.feed_data(text)

diff --git a/tests/test_multipart.py b/tests/test_multipart.py
@@ -599,6 +599,21 @@ async def test_read_form(self, newline: Any) -> None:
             result = await obj.form()
         assert [("foo", "bar"), ("foo", "baz"), ("boo", "")] == result
 
+    async def test_read_form_invalid_utf8(self, newline: Any) -> None:
+        invalid_unicode_byte = b"\xff"
+        data = invalid_unicode_byte + b"%s--:--" % newline
+        with Stream(data) as stream:
+            obj = aiohttp.BodyPartReader(
+                BOUNDARY,
+                {CONTENT_TYPE: "application/x-www-form-urlencoded"},
+                stream,
+                _newline=newline,
+            )
+            with pytest.raises(
+                ValueError, match="data cannot be decoded with utf-8 encoding"
+            ):
+                await obj.form()
+
     async def test_read_form_encoding(self, newline: Any) -> None:
         data = b"foo=bar&foo=baz&boo=%s--:--" % newline
         with Stream(data) as stream: