feat(writer): Add basic PDF serializer

aescarias · Mar 27, 2024 · d01c2fa · d01c2fa
1 parent c4abc12
commit d01c2fa
Show file tree

Hide file tree

Showing 7 changed files with 381 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -1,10 +1,11 @@
 # pdfnaut
 
-> Warning: `pdfnaut` is currently in an early stage of development and has only been tested with a small set of compliant documents. Expect bugs or issues.
+> [!Warning]
+> pdfnaut is currently in an early stage of development and has only been tested with a small set of compliant documents. Expect bugs or issues.
 
-`pdfnaut` is a Python library for parsing PDF 1.7 files.
+pdfnaut is a Python library for parsing PDF 1.7 files.
 
-`pdfnaut` provides a low-level interface for reading and writing PDF objects as defined in the [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf). `pdfnaut` currently does not attempt to deviate from the specification. There's no guarantee that valid documents not fully conforming to the standard will be processed correctly.
+pdfnaut provides a low-level interface for reading and writing PDF objects as defined in the [PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf). pdfnaut currently does not attempt to deviate from the specification. There's no guarantee that valid documents not fully conforming to the standard will be processed correctly.
 
 ```py
 from pdfnaut import PdfParser

diff --git a/pdfnaut/exceptions.py b/pdfnaut/exceptions.py
@@ -6,3 +6,7 @@ class PdfParseError(Exception):
 class PdfFilterError(Exception):
     """A filter is unable to decode a stream or the filter is simply unsupported"""
     pass
+
+class PdfWriteError(Exception):
+    """Raised when the writer is unable to serialize an object."""
diff --git a/pdfnaut/serializer.py b/pdfnaut/serializer.py
@@ -0,0 +1,244 @@
+from typing import Literal, Any
+from collections import defaultdict
+
+from .objects.stream import PdfStream
+from .objects.xref import PdfXRefSubsection, PdfXRefTable, FreeXRefEntry, InUseXRefEntry
+from .objects.base import PdfComment, PdfIndirectRef, PdfObject, PdfNull, PdfName, PdfHexString
+from .parsers.simple import STRING_ESCAPE
+from .exceptions import PdfWriteError
+
+def serialize_comment(comment: PdfComment) -> bytes:
+    """Serialize a comment as a ``%`` marker followed by the comment's value."""
+    return b"".join((b"%", comment.value))
+
+def serialize_null(_) -> bytes:
+    """Serialize the null object. The argument is ignored."""
+    return b"null"
+
+def serialize_bool(boolean: bool) -> bytes:
+    """Serialize a boolean as the keyword ``true`` or ``false``."""
+    if boolean:
+        return b"true"
+    return b"false"
+
+def serialize_literal_string(byte_str: bytes, *, keep_ascii: bool = False) -> bytes:
+    """Serialize ``byte_str`` as a literal string enclosed in parentheses.
+
+    Bytes that appear as values in ``STRING_ESCAPE`` are replaced by the escape
+    sequence mapped to them. Balanced parentheses are written verbatim while
+    every unbalanced parenthesis is preceded by a backslash.
+
+    Arguments:
+        byte_str (bytes):
+            The contents of the string.
+
+        keep_ascii (bool, optional):
+            Whether to write non-ASCII bytes in ``\\ddd`` octal notation.
+            Defaults to False.
+
+    Returns:
+        The serialized string, including the enclosing parentheses.
+    """
+    # Inverted so that a raw byte maps to the escape sequence representing it.
+    escape = {v: k for k, v in STRING_ESCAPE.items()}
+
+    # First pass: find the positions (in the input) of unbalanced parentheses.
+    # Escaping is decided per input position so that the escapes emitted in
+    # the second pass cannot shift where the backslashes end up.
+    open_parens = []
+    unbalanced = set()
+
+    for pos, code in enumerate(byte_str):
+        if code == 0x28:  # "("
+            open_parens.append(pos)
+        elif code == 0x29:  # ")"
+            if open_parens:
+                open_parens.pop()
+            else:
+                unbalanced.add(pos)
+
+    # Any opening parenthesis still on the stack was never closed.
+    unbalanced.update(open_parens)
+
+    # Second pass: emit the escaped contents.
+    output = bytearray()
+    for pos, code in enumerate(byte_str):
+        char = bytes((code,))
+        if char in b"()":
+            # Balanced parentheses require no special treatment
+            if pos in unbalanced:
+                output += b"\\"
+            output += char
+        elif (esc := escape.get(char)) is not None:
+            output += esc
+        elif keep_ascii and code > 0x7F:
+            # \ddd notation
+            output += f"\\{code:03o}".encode()
+        else:
+            output += char
+
+    return b"(" + bytes(output) + b")"
+
+def serialize_name(name: PdfName) -> bytes:
+    """Serialize a name object as a solidus followed by the name's bytes.
+
+    ASCII alphanumeric bytes are written as is. Every other byte is written
+    as ``#`` followed by exactly two hexadecimal digits.
+    """
+    parts = [b"/"]
+
+    for code in name.value:
+        char = bytes((code,))
+        if char.isalnum():
+            parts.append(char)
+        else:
+            # Zero-padded: a one-digit escape such as "#9" is not valid.
+            parts.append(f"#{code:02x}".encode())
+
+    return b"".join(parts)
+
+def serialize_hex_string(string: PdfHexString) -> bytes:
+    """Serialize a hexadecimal string by wrapping its raw value in ``<`` and ``>``."""
+    return b"".join((b"<", string.raw, b">"))
+
+def serialize_indirect_ref(reference: PdfIndirectRef) -> bytes:
+    """Serialize an indirect reference as ``<object number> <generation> R``."""
+    text = "{} {} R".format(reference.object_number, reference.generation)
+    return text.encode()
+
+def serialize_numeric(number: int | float) -> bytes:
+    """Serialize an integer or real number.
+
+    Floats whose string form uses exponent notation (such as ``1e-05``) are
+    expanded to fixed-point notation because PDF numbers have no exponent
+    syntax.
+
+    Raises:
+        PdfWriteError: If ``number`` is NaN or infinite.
+    """
+    if isinstance(number, float):
+        from decimal import Decimal
+        from math import isfinite
+
+        if not isfinite(number):
+            raise PdfWriteError(f"Cannot serialize non-finite number {number!r}")
+
+        text = str(number)
+        if "e" in text:
+            # Decimal expands the shortest repr exactly, without adding noise digits.
+            text = format(Decimal(text), "f")
+        return text.encode()
+
+    return str(number).encode()
+
+def serialize_array(array: list[Any]) -> bytes:
+    """Serialize a list as an array: its serialized items separated by
+    spaces and enclosed in square brackets."""
+    serialized_items = [serialize(item) for item in array]
+    return b"[" + b" ".join(serialized_items) + b"]"
+
+def serialize_dictionary(mapping: dict[str, Any]) -> bytes:
+    """Serialize a mapping as a dictionary enclosed in ``<<`` and ``>>``.
+
+    Each key is encoded and written as a name, followed by its serialized
+    value. Keys and values are separated by single spaces.
+    """
+    pieces: list[bytes] = []
+    for key, value in mapping.items():
+        pieces += (serialize(PdfName(key.encode())), serialize(value))
+
+    return b"<<" + b" ".join(pieces) + b">>"
+
+def serialize_stream(stream: PdfStream, *, eol: bytes) -> bytes:
+    """Serialize a stream: its dictionary, the ``stream`` keyword, the raw
+    stream data, and the ``endstream`` keyword, each separated by ``eol``."""
+    return b"".join((
+        serialize_dictionary(stream.details), eol,
+        b"stream", eol,
+        stream.raw, eol,
+        b"endstream",
+    ))
+
+def serialize(object_: PdfObject | PdfStream | PdfComment, *, params: dict[str, Any] | None = None) -> bytes:
+    """Serialize ``object_`` by dispatching on its type.
+
+    Arguments:
+        object_ (:class:`PdfObject` | :class:`PdfStream` | :class:`PdfComment`):
+            The object to serialize.
+
+        params (dict[str, Any], optional):
+            Extra parameters for specific serializers. The recognized keys are
+            ``keep_ascii`` (literal strings, defaults to False) and ``eol``
+            (streams, defaults to CRLF).
+
+    Raises:
+        PdfWriteError: If the type of ``object_`` is not supported.
+    """
+    if params is None:
+        params = {}
+
+    if isinstance(object_, PdfComment):
+        return serialize_comment(object_)
+    elif isinstance(object_, PdfName):
+        return serialize_name(object_)
+    elif isinstance(object_, bytes):
+        return serialize_literal_string(object_, keep_ascii=params.get("keep_ascii", False))
+    elif isinstance(object_, bool):
+        # bool is a subclass of int so it must be tested before the numeric branch.
+        return serialize_bool(object_)
+    elif isinstance(object_, PdfNull):
+        return serialize_null(object_)
+    elif isinstance(object_, PdfHexString):
+        return serialize_hex_string(object_)
+    elif isinstance(object_, PdfIndirectRef):
+        return serialize_indirect_ref(object_)
+    elif isinstance(object_, (int, float)):
+        return serialize_numeric(object_)
+    elif isinstance(object_, list):
+        return serialize_array(object_)
+    elif isinstance(object_, dict):
+        return serialize_dictionary(object_)
+    elif isinstance(object_, PdfStream):
+        # Fall back to CRLF rather than failing with a KeyError when no
+        # end-of-line was provided.
+        return serialize_stream(object_, eol=params.get("eol", b"\r\n"))
+
+    raise PdfWriteError(f"Cannot serialize type {type(object_)}")
+
+
+class PdfSerializer:
+    """A PDF serializer that can create a valid PDF document.
+
+    The serialized document is accumulated in :attr:`content` as each
+    ``write_*`` method is called.
+
+    Arguments:
+        eol (bytes, optional):
+            The end-of-line to be used when serializing (CR, LF, or CRLF). Defaults to CRLF.
+    """
+
+    def __init__(self, *, eol: Literal[b"\r\n", b"\r", b"\n"] = b"\r\n") -> None:
+        self.content = b""
+        self.eol = eol
+
+        self.objects: dict[tuple[int, int], PdfObject | PdfStream] = {}
+
+    def write_header(self, version: str, *, with_binary_marker: bool = True) -> None:
+        """Appends the PDF file header to the document (``§ 7.5.2 File Header``)
+
+        Arguments:
+            version (str):
+                A string representing the version of the PDF file.
+
+            with_binary_marker (bool, optional):
+                Whether to also append the recommended binary marker. Defaults to True.
+        """
+
+        comment = PdfComment(f"PDF-{version}".encode())
+        self.content += serialize_comment(comment) + self.eol
+        if with_binary_marker:
+            # A comment made of four bytes with the high bit set.
+            marker = PdfComment(b"\xee\xe1\xf5\xf4")
+            self.content += serialize_comment(marker) + self.eol
+
+    def write_object(self, reference: PdfIndirectRef | tuple[int, int], contents: PdfObject | PdfStream) -> int:
+        """Writes an indirect object to the stream.
+
+        Arguments:
+            reference (:class:`PdfIndirectRef` | tuple[int, int]):
+                The object number and generation to which the object should be assigned.
+
+            contents (:class:`PdfObject` | :class:`PdfStream`):
+                The contents to associate with the reference.
+
+        Returns:
+            The offset where the indirect object starts.
+        """
+        if isinstance(reference, tuple):
+            reference = PdfIndirectRef(*reference)
+
+        # The offset is taken before anything is appended so that it points
+        # at the first byte of the "N G obj" line.
+        offset = len(self.content)
+        self.content += f"{reference.object_number} {reference.generation} obj".encode() + self.eol
+        self.content += serialize(contents, params={ "eol": self.eol }) + self.eol
+        self.content += b"endobj" + self.eol
+
+        return offset
+
+    def generate_standard_xref_table(self, rows: list[tuple[str, int, int, int]]) -> PdfXRefTable:
+        """Generates an uncompressed cross-reference table from a list of ``rows``.
+
+        Each row is a tuple of 4 values: a string that is either "f" (free) or "n" (in use);
+        the object number; the generation; and the value of the entry (next free or offset).
+
+        Rows are sorted by object number and grouped into subsections. A new
+        subsection starts whenever there is a gap in the object numbers. An
+        empty list of rows produces a table with no subsections.
+
+        Returns:
+            An XRef table that can be serialized by :meth:`.write_standard_xref_table`.
+        """
+        table = PdfXRefTable([])
+        if not rows:
+            return table
+
+        rows = sorted(rows, key=lambda row: row[1]) # row[1] = object number
+
+        subsections = defaultdict(list)
+        first_obj_num = previous_obj_num = rows[0][1]
+
+        for entry in rows:
+            obj_num = entry[1]
+            # A gap in the numbering means this entry opens a new subsection
+            # keyed by its own object number.
+            if obj_num - previous_obj_num > 1:
+                first_obj_num = obj_num
+
+            subsections[first_obj_num].append(entry)
+            previous_obj_num = obj_num
+
+        for first_obj_num, raw_entries in subsections.items():
+            entries = []
+            for (typ_, _obj_num, gen_num, offset) in raw_entries:
+                if typ_ == "f":
+                    entries.append(FreeXRefEntry(offset, gen_num))
+                else:
+                    entries.append(InUseXRefEntry(offset, gen_num))
+
+            table.sections.append(PdfXRefSubsection(first_obj_num, len(entries), entries))
+
+        return table
+
+    def write_standard_xref_table(self, table: PdfXRefTable) -> int:
+        """Writes an uncompressed XRef table (``§ 7.5.4 Cross-Reference Table``)
+        to the stream. Returns the ``startxref`` offset that should be written.
+
+        Raises:
+            PdfWriteError: If the table contains an entry that is neither
+                in use nor free.
+        """
+        startxref = len(self.content)
+        self.content += b"xref" + self.eol
+
+        # Each entry must be exactly 20 bytes long including its end-of-line,
+        # so a single-byte end-of-line is preceded by a space.
+        entry_eol = self.eol if len(self.eol) == 2 else b" " + self.eol
+
+        for section in table.sections:
+            self.content += f"{section.first_obj_number} {section.count}".encode() + self.eol
+            for entry in section.entries:
+                if isinstance(entry, InUseXRefEntry):
+                    self.content += f"{entry.offset:0>10} {entry.generation:0>5} n".encode()
+                elif isinstance(entry, FreeXRefEntry):
+                    self.content += f"{entry.next_free_object:0>10} {entry.gen_if_used_again:0>5} f".encode()
+                else:
+                    raise PdfWriteError("Cannot write compressed XRef entry within uncompressed table")
+                self.content += entry_eol
+
+        return startxref
+
+    def write_trailer(self, details: dict[str, Any], startxref: int) -> None:
+        """Writes the trailer of the PDF (``details``) and the ``startxref`` offset."""
+        self.content += b"trailer" + self.eol
+        self.content += serialize_dictionary(details) + self.eol
+        self.content += b"startxref" + self.eol
+        self.content += str(startxref).encode() + self.eol
+
+    def write_eof(self) -> None:
+        """Writes the End-Of-File marker"""
+        self.content += b"%%EOF" + self.eol
diff --git a/tests/test_encryption.py b/tests/test_encryption.py
@@ -1,9 +1,12 @@
+# Unit tests for PDF encryption routines and the Standard security handler
+
 from __future__ import annotations
 from typing import cast, Any
 
 from pdfnaut import PdfParser
 from pdfnaut.parsers.pdf import PermsAcquired
 
+
 def test_std_security_handler():
     with open("tests/docs/sample.pdf", "rb") as fp:
         parser = PdfParser(fp.read())
@@ -27,7 +30,10 @@ def test_std_security_handler():
         # but not 'some'
         assert parser.decrypt("some") is PermsAcquired.NONE
 
+
 def test_rc4_aes_decryption():
+    # TODO: A stream check wouldn't hurt?
+    # TODO: Some files have different StmF and StrF filters
     with open("tests/docs/encrypted-arc4.pdf", "rb") as fp:
         parser = PdfParser(fp.read())
         parser.parse()
@@ -43,4 +49,3 @@ def test_rc4_aes_decryption():
         parser.decrypt("nil")
         metadata = cast("dict[str, Any]", parser.resolve_reference(parser.trailer["Info"]))
         assert metadata["Producer"].value == b"pypdf"
-
diff --git a/tests/test_object_parsing.py b/tests/test_object_parsing.py
@@ -1,4 +1,4 @@
-# Unit tests for the PDF tokenizer
+# Unit tests for tokenizing the COS syntax in PDFs.
 
 from __future__ import annotations
 
@@ -15,17 +15,20 @@ def test_null_and_boolean() -> None:
     assert isinstance(tokens[0], PdfNull)
     assert tokens[1] is True and tokens[2] is False
 
+
 def test_numeric() -> None:
     parser = PdfTokenizer(b"-1 +25 46 -32.591 +52.871 3.1451")
     tokens = list(parser)
 
     assert tokens == [-1, 25, 46, -32.591, 52.871, 3.1451]
 
+
 def test_name_object() -> None:
     parser = PdfTokenizer(b"/Type /SomeR@ndomK*y /Lime#20Green / /F#23")
     tokens = list(parser)
-    assert tokens == [ PdfName(b"Type"), PdfName(b"SomeR@ndomK*y"), PdfName(b"Lime Green"), 
-                       PdfName(b""), PdfName(b"F#") ]
+    assert tokens == [PdfName(b"Type"), PdfName(b"SomeR@ndomK*y"), PdfName(b"Lime Green"), 
+                      PdfName(b""), PdfName(b"F#")]
+
 
 def test_literal_string() -> None:
     # Basic string
@@ -49,12 +52,14 @@ def test_literal_string() -> None:
     parser = PdfTokenizer(b"(This is a string with a \\t tab character and a \\053 plus.))")
     assert parser.next_token() == b"This is a string with a \t tab character and a + plus."
 
+
 def test_hex_string() -> None:
     parser = PdfTokenizer(b"<A5B2FF><6868ADE>")
     tokens = cast("list[PdfHexString]", list(parser))
 
     assert tokens[0].raw == b"A5B2FF" and tokens[1].raw == b"6868ADE0" 
 
+
 def test_dictionary() -> None:
     parser = PdfTokenizer(b"""<< /Type /Catalog /Metadata 2 0 R /Pages 3 0 R >>""")
     assert parser.next_token() == { 
@@ -63,6 +68,7 @@ def test_dictionary() -> None:
         "Pages": PdfIndirectRef(3, 0) 
     }
 
+
 def test_comment() -> None:
     # This also counts as an EOL test
     parser = PdfTokenizer(b"% This is a comment\r\n"
@@ -79,10 +85,11 @@ def test_comment() -> None:
     assert isinstance(com := parser.next_token(), PdfComment) \
         and com.value == b" This is a comment ending with \\r"
 
+
 def test_array() -> None:
     # Simple array
     parser = PdfTokenizer(b"[45 <</Size 40>> (42)]") 
-    assert parser.next_token() == [45, { "Size": 40 }, b"42"]
+    assert parser.next_token() == [45, {"Size": 40}, b"42"]
 
     # Nested array
     parser = PdfTokenizer(b"[/XYZ [45 32 76] /Great]")

diff --git a/tests/test_parsing_files.py b/tests/test_parsing_files.py
@@ -1,9 +1,11 @@
+# Unit tests for parsing a subset of handcrafted and example files.
 import pytest
 
 from pdfnaut.parsers import PdfParser
 from pdfnaut.objects import PdfStream, PdfIndirectRef
 from pdfnaut.exceptions import PdfParseError
 
+
 def test_simple_pdf() -> None:
     """Tests a simple PDF. In this context, "simple" means an unencrypted PDF 
     with no compression and few pages of content."""
@@ -22,6 +24,7 @@ def test_simple_pdf() -> None:
         first_page_contents = parser.resolve_reference(first_page["Contents"])
         assert isinstance(first_page_contents, PdfStream)
 
+
 def test_invalid_pdfs() -> None:
     """Tests invalid PDF scenarios. The cases included should all fail."""
     # "PDF" with no header
@@ -36,6 +39,7 @@ def test_invalid_pdfs() -> None:
             parser.parse()
             parser.resolve_reference(PdfIndirectRef(1, 0))
 
+
 def test_pdf_with_incremental() -> None:
     """Tests whether an incremental PDF is parsed correctly. Basically, whether the 
     correct trailer is provided and whether the XRefs are merged."""
@@ -46,6 +50,7 @@ def test_pdf_with_incremental() -> None:
         assert len(parser.update_xrefs) == 2 and len(parser._trailers) == 2
         assert parser.trailer["Size"] == len(parser.xref)
 
+
 def test_pdf_with_xref_stream() -> None:
     """Tests a PDF document with a compressed XRef stream"""
     with open("tests/docs/compressed-xref.pdf", "rb") as data: