feat: 0.1 🎉

aescarias · Mar 30, 2024 · ca3a981 · ca3a981
1 parent d01c2fa
commit ca3a981
Show file tree

Hide file tree

Showing 12 changed files with 95 additions and 65 deletions.
diff --git a/pdfnaut/__init__.py b/pdfnaut/__init__.py
@@ -1,10 +1,13 @@
 """
-PDFnaut is a Python library for parsing the structure of PDFs.
+PDFnaut is a Python library for reading and writing PDFs at a low level.
 """
 
 from .parsers import PdfParser
+from .serializer import PdfSerializer
+
+__all__ = ("PdfParser", "PdfSerializer")
 
 __name__ = "pdfnaut"
 __version__ = "0.1.0"
-__description__ = "Parse and explore PDFs with ease"
+__description__ = "Explore PDFs with ease"
 __license__ = "Apache 2.0"
diff --git a/pdfnaut/filters.py b/pdfnaut/filters.py
@@ -10,7 +10,7 @@
 
 from .parsers.simple import WHITESPACE
 from .exceptions import PdfFilterError
-from .objects.base import PdfName, PdfIndirectRef
+from .objects.base import PdfName
 
 
 def predict_paeth(a: int, b: int, c: int) -> int:

diff --git a/pdfnaut/objects/__init__.py b/pdfnaut/objects/__init__.py
@@ -1,9 +1,11 @@
-from .base import (
-    PdfComment, PdfHexString, PdfIndirectRef, PdfName, 
-    PdfNull, PdfObject, PdfOperator
-)
+from .base import (PdfComment, PdfHexString, PdfIndirectRef, PdfName, 
+                   PdfNull, PdfObject, PdfOperator)
 from .stream import PdfStream
-from .xref import (
-    CompressedXRefEntry, FreeXRefEntry, InUseXRefEntry,
-    PdfXRefEntry, PdfXRefSubsection, PdfXRefTable
+from .xref import (CompressedXRefEntry, FreeXRefEntry, InUseXRefEntry,
+                   PdfXRefEntry, PdfXRefSubsection, PdfXRefTable)
+
+__all__ = (
+    "PdfComment", "PdfHexString", "PdfIndirectRef", "PdfName", "PdfNull", 
+    "PdfObject", "PdfOperator", "PdfStream", "CompressedXRefEntry", "FreeXRefEntry", 
+    "InUseXRefEntry", "PdfXRefEntry", "PdfXRefSubsection", "PdfXRefTable"
 )
diff --git a/pdfnaut/objects/base.py b/pdfnaut/objects/base.py
@@ -28,7 +28,7 @@ class PdfHexString:
     (``§ 7.3.4.3 Hexadecimal Strings``)."""
 
     raw: bytes
-    """The raw (in hex) value of the string"""
+    """The hex value of the string"""
 
     def __post_init__(self) -> None:
         # If uneven, we append a zero. (it's hexadecimal -- 2 chars = byte)
@@ -37,6 +37,7 @@ def __post_init__(self) -> None:
 
     @classmethod
     def from_raw(cls, data: bytes):
+        """Creates a hexadecimal string from ``data``"""
         return cls(hexlify(data))
 
     @property

diff --git a/pdfnaut/objects/stream.py b/pdfnaut/objects/stream.py
@@ -20,7 +20,8 @@ def decompress(self) -> bytes:
         
         If no filter is defined, it returns the original contents.
         
-        Raises :class:`PdfFilterError` if a filter is unsupported."""
+        Raises :class:`.pdfnaut.exceptions.PdfFilterError` if a filter is unsupported."""
+
         filters = self.details.get("Filter")
         params = self.details.get("DecodeParms")
 

diff --git a/pdfnaut/objects/xref.py b/pdfnaut/objects/xref.py
@@ -9,37 +9,39 @@ class PdfXRefTable:
     """A cross-reference table which permits random access to objects across a PDF.
     
     It is conformed of subsections indicating where objects are located. A PDF file
-    starts with one subsection and additional ones are added per each incremental update.
+    starts with one section (table) containing one subsection (or two if linearized). 
+    An additional section is added with each incremental update.
     """
     sections: list[PdfXRefSubsection]
 
 
 @dataclass
 class PdfXRefSubsection:
-    """A subsection part of an XRef table. Each subsection generally indicates 
-    incremental updates to a document."""
+    """A subsection part of an XRef table. A subsection includes ``count`` entries 
+    whose object numbers start at ``first_obj_number`` and are incremented by one."""
     first_obj_number: int
     count: int
     entries: list[PdfXRefEntry]
 
 
 @dataclass
 class FreeXRefEntry:
-    """A Type 0 entry. These entries form the linked list of free objects."""
+    """A Type 0 (f) entry. These entries are members of the linked list of free objects."""
     next_free_object: int
     gen_if_used_again: int
 
 
 @dataclass
 class InUseXRefEntry:
-    """A Type 1 entry. These point to uncompressed entries currently in use."""
+    """A Type 1 (n) entry. These entries point to indirect objects currently in use."""
     offset: int
     generation: int
 
 
 @dataclass
 class CompressedXRefEntry:
-    """A Type 2 entry. These point to entries that are within an object stream."""
+    """A Type 2 entry. These entries point to objects that are within an object stream,
+    which is assumed to be "compressed" although it may not be."""
     objstm_number: int
     index_within: int
 

diff --git a/pdfnaut/parsers/__init__.py b/pdfnaut/parsers/__init__.py
@@ -1,2 +1,4 @@
 from .pdf import PdfParser
 from .simple import PdfTokenizer
+
+__all__ = ("PdfParser", "PdfTokenizer")
diff --git a/pdfnaut/parsers/pdf.py b/pdfnaut/parsers/pdf.py
@@ -8,8 +8,8 @@
 from ..objects.base import PdfNull, PdfIndirectRef, PdfObject, PdfName, PdfHexString
 from ..objects.stream import PdfStream
 from ..objects.xref import (
-    PdfXRefEntry, PdfXRefSubsection, PdfXRefTable, PdfXRefEntry, FreeXRefEntry,
-    CompressedXRefEntry, InUseXRefEntry
+    PdfXRefEntry, PdfXRefSubsection, PdfXRefTable, 
+    FreeXRefEntry, InUseXRefEntry, CompressedXRefEntry
 )
 from ..exceptions import PdfParseError
 from ..security_handler import StandardSecurityHandler
@@ -30,7 +30,7 @@ class PdfParser:
     
     It consumes the PDF's cross-reference tables and trailers. It merges the tables
     into a single one and provides an interface to individually parse each indirect 
-    object using :class:`PdfTokenizer`."""
+    object using :class:`~pdfnaut.parsers.simple.PdfTokenizer`."""
 
     def __init__(self, data: bytes) -> None:
         self._tokenizer = PdfTokenizer(data)
@@ -42,7 +42,7 @@ def __init__(self, data: bytes) -> None:
         self.trailer: dict[str, Any] = {}
         """The most recent trailer in the PDF document.
         
-        For details on the contents of the trailer, see § 7.5.5 File Trailer of the PDF spec.
+        For details on the contents of the trailer, see ``§ 7.5.5 File Trailer`` in the PDF spec.
         """
 
         self.xref: dict[tuple[int, int], PdfXRefEntry] = {}
@@ -61,7 +61,7 @@ def __init__(self, data: bytes) -> None:
         """
 
         self.security_handler = None
-        """The document's standard security handler if any, as specified in the Encrypt 
+        """The document's standard security handler, if any, as specified in the Encrypt 
         dictionary of the PDF trailer.
 
         This field being set indicates that a supported security handler was used for
@@ -199,7 +199,7 @@ def parse_simple_xref(self) -> PdfXRefTable:
         ``§ 7.5.4 Cross-Reference Table`` in the PDF spec.
 
         If ``startxref`` points to an XRef object, :meth:`.parse_compressed_xref`
-        is called instead.
+        should be called instead.
         """
         self._tokenizer.advance(4)
         self._tokenizer.advance_whitespace()
@@ -315,7 +315,7 @@ def parse_indirect_object(self, xref_entry: InUseXRefEntry) -> PdfObject | PdfSt
                 length = self.resolve_reference(length)
                 self._tokenizer.position = _current 
             if not isinstance(length, int):
-                raise PdfParseError(f"\\Length entry of stream extent not an integer")
+                raise PdfParseError("\\Length entry of stream extent not an integer")
 
             stream = PdfStream(tok, self.parse_stream(xref_entry, length))
             if indirect_ref is None:
@@ -417,12 +417,12 @@ def resolve_reference(self, reference: PdfIndirectRef | tuple[int, int]):
         """Resolves a reference into the indirect object it points to.
         
         Arguments:
-            reference (int | :class:`PdfIndirectRef`): 
+            reference (:class:`.PdfIndirectRef` | tuple[int, int]):
                 An indirect reference object or a tuple of two integers representing, 
                 in order, the object number and the generation number.
 
         Returns:
-            A PDF object if the reference was found, otherwise :class:`PdfNull`.
+            A PDF object if the reference was found, otherwise :class:`.PdfNull`.
         """
 
         if isinstance(reference, tuple):

diff --git a/pdfnaut/parsers/simple.py b/pdfnaut/parsers/simple.py
@@ -1,13 +1,10 @@
-"""A PDF tokenizer for objects"""
 from __future__ import annotations
 
 import re
 from typing import Any
 
-from ..objects.base import (
-    PdfHexString, PdfName, PdfNull, PdfComment, 
-    PdfIndirectRef, PdfObject, PdfOperator
-)
+from ..objects.base import (PdfHexString, PdfName, PdfNull, PdfComment, 
+                            PdfIndirectRef, PdfObject, PdfOperator)
 
 # as defined in § 7.2.2 Character Set, Table 1 & Table 2
 DELIMITERS = b"()<>[]{}/%"
@@ -28,7 +25,7 @@
 
 class PdfTokenizer:
     """A parser designed to consume objects that do not depend on cross reference 
-    tables. It is used by :class:`PdfParser` for this purpose.
+    tables. It is used by :class:`~pdfnaut.parsers.pdf.PdfParser` for this purpose.
     
     This parser will not parse indirect objects or streams because those do depend on XRef 
     and are effectively not sequentially parsable. Because of this limitation, it is not 
@@ -240,7 +237,7 @@ def parse_dictionary(self) -> dict[str, Any]:
         } 
 
     def parse_array(self) -> list[Any]:
-        """Parses an array. Arrays are heterogenous in PDF so they are mapped to ``list``s."""
+        """Parses an array. Arrays are heterogeneous in PDF so they are mapped to Python lists."""
         self.advance() # past the [ 
         items: list[Any] = []
 

diff --git a/pdfnaut/security_handler.py b/pdfnaut/security_handler.py
@@ -60,8 +60,8 @@ def key_length(self) -> int:
         return self.encryption.get("Length", 40) // 8
 
     def compute_encryption_key(self, password: bytes) -> bytes:  
-        """Computes an encryption key as defined in ``§ 7.6.3.3 Encryption Key Algorithm`` > 
-        ``Algorithm 2: Computing an encryption key`` in the PDF spec."""      
+        """Computes an encryption key as defined in ``§ 7.6.3.3 Encryption Key Algorithm > 
+        Algorithm 2: Computing an encryption key`` in the PDF spec."""      
         padded_password = password[:32] + PASSWORD_PADDING[:32 - len(password)]
 
         psw_hash = md5(padded_password)