Support reading multipart data with \n (LF) lines (#3492)

* Support reading multipart data with `\n` (`LF`) lines. While the RFC clearly requires `CRLF` newlines, there are quite a lot of implementations which use just `LF`. Even Python's stdlib produces multipart messages with `\n` newlines by default for compatibility reasons. We won't change how we produce multipart content - there we follow the RFC. However, we can detect `\n` lines quite easily, which makes supporting them quite cheap. * Add a test for mixed newlines. Just in case. That's a strange case, but it seems we pass it. * Make the newline argument keyword-only and explicitly private. This argument is not designed to be set by users. It is chosen automatically depending on the newline format detected while parsing the multipart data, and due to the recursive nature of the multipart format it has to be passed around to nested readers.
aio-libs · Jan 15, 2019 · fcedc66 · fcedc66
1 parent 8fbe7a1
commit fcedc66
Show file tree

Hide file tree

Showing 3 changed files with 429 additions and 208 deletions.
diff --git a/CHANGES/2302.feature b/CHANGES/2302.feature
@@ -0,0 +1 @@
+Support reading multipart data with `\n` (`LF`) lines
diff --git a/aiohttp/multipart.py b/aiohttp/multipart.py
@@ -237,11 +237,17 @@ class BodyPartReader:
 
     chunk_size = 8192
 
-    def __init__(self, boundary: bytes,
-                 headers: Mapping[str, Optional[str]],
-                 content: StreamReader) -> None:
+    def __init__(
+        self,
+        boundary: bytes,
+        headers: Mapping[str, Optional[str]],
+        content: StreamReader,
+        *,
+        _newline: bytes = b'\r\n'
+    ) -> None:
         self.headers = headers
         self._boundary = boundary
+        self._newline = _newline
         self._content = content
         self._at_eof = False
         length = self.headers.get(CONTENT_LENGTH, None)
@@ -300,8 +306,8 @@ async def read_chunk(self, size: int=chunk_size) -> bytes:
         if self._read_bytes == self._length:
             self._at_eof = True
         if self._at_eof:
-            clrf = await self._content.readline()
-            assert b'\r\n' == clrf, \
+            newline = await self._content.readline()
+            assert newline == self._newline, \
                 'reader did not read all the data or it is malformed'
         return chunk
 
@@ -328,11 +334,15 @@ async def _read_chunk_from_stream(self, size: int) -> bytes:
         assert self._content_eof < 3, "Reading after EOF"
         assert self._prev_chunk is not None
         window = self._prev_chunk + chunk
-        sub = b'\r\n' + self._boundary
+
+        intermeditate_boundary = self._newline + self._boundary
+
         if first_chunk:
-            idx = window.find(sub)
+            pos = 0
         else:
-            idx = window.find(sub, max(0, len(self._prev_chunk) - len(sub)))
+            pos = max(0, len(self._prev_chunk) - len(intermeditate_boundary))
+
+        idx = window.find(intermeditate_boundary, pos)
         if idx >= 0:
             # pushing boundary back to content
             with warnings.catch_warnings():
@@ -344,6 +354,7 @@ async def _read_chunk_from_stream(self, size: int) -> bytes:
             chunk = window[len(self._prev_chunk):idx]
             if not chunk:
                 self._at_eof = True
+
         result = self._prev_chunk
         self._prev_chunk = chunk
         return result
@@ -372,7 +383,8 @@ async def readline(self) -> bytes:
         else:
             next_line = await self._content.readline()
             if next_line.startswith(self._boundary):
-                line = line[:-2]  # strip CRLF but only once
+                # strip newline but only once
+                line = line[:-len(self._newline)]
             self._unread.append(next_line)
 
         return line
@@ -516,10 +528,16 @@ class MultipartReader:
     #: Body part reader class for non multipart/* content types.
     part_reader_cls = BodyPartReader
 
-    def __init__(self, headers: Mapping[str, str],
-                 content: StreamReader) -> None:
+    def __init__(
+        self,
+        headers: Mapping[str, str],
+        content: StreamReader,
+        *,
+        _newline: bytes = b'\r\n'
+    ) -> None:
         self.headers = headers
         self._boundary = ('--' + self._get_boundary()).encode()
+        self._newline = _newline
         self._content = content
         self._last_part = None
         self._at_eof = False
@@ -592,9 +610,13 @@ def _get_part_reader(self, headers: 'CIMultiDictProxy[str]') -> Any:
         if mimetype.type == 'multipart':
             if self.multipart_reader_cls is None:
                 return type(self)(headers, self._content)
-            return self.multipart_reader_cls(headers, self._content)
+            return self.multipart_reader_cls(
+                headers, self._content, _newline=self._newline
+            )
         else:
-            return self.part_reader_cls(self._boundary, headers, self._content)
+            return self.part_reader_cls(
+                self._boundary, headers, self._content, _newline=self._newline
+            )
 
     def _get_boundary(self) -> str:
         mimetype = parse_mimetype(self.headers[CONTENT_TYPE])
@@ -625,6 +647,11 @@ async def _read_until_first_boundary(self) -> None:
             if chunk == b'':
                 raise ValueError("Could not find starting boundary %r"
                                  % (self._boundary))
+            if chunk.startswith(self._boundary):
+                _, newline = chunk.split(self._boundary, 1)
+                assert newline in (b'\r\n', b'\n')
+                self._newline = newline
+
             chunk = chunk.rstrip()
             if chunk == self._boundary:
                 return