Skip to content

Commit

Permalink
feat: 0.1 🎉
Browse files Browse the repository at this point in the history
  • Loading branch information
aescarias committed Mar 30, 2024
1 parent d01c2fa commit ca3a981
Show file tree
Hide file tree
Showing 12 changed files with 95 additions and 65 deletions.
7 changes: 5 additions & 2 deletions pdfnaut/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
"""
PDFnaut is a Python library for parsing the structure of PDFs.
PDFnaut is a Python library for reading and writing PDFs at a low level.
"""

from .parsers import PdfParser
from .serializer import PdfSerializer

__all__ = ("PdfParser", "PdfSerializer")

__name__ = "pdfnaut"
__version__ = "0.1.0"
__description__ = "Parse and explore PDFs with ease"
__description__ = "Explore PDFs with ease"
__license__ = "Apache 2.0"
2 changes: 1 addition & 1 deletion pdfnaut/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from .parsers.simple import WHITESPACE
from .exceptions import PdfFilterError
from .objects.base import PdfName, PdfIndirectRef
from .objects.base import PdfName


def predict_paeth(a: int, b: int, c: int) -> int:
Expand Down
16 changes: 9 additions & 7 deletions pdfnaut/objects/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from .base import (
PdfComment, PdfHexString, PdfIndirectRef, PdfName,
PdfNull, PdfObject, PdfOperator
)
from .base import (PdfComment, PdfHexString, PdfIndirectRef, PdfName,
PdfNull, PdfObject, PdfOperator)
from .stream import PdfStream
from .xref import (
CompressedXRefEntry, FreeXRefEntry, InUseXRefEntry,
PdfXRefEntry, PdfXRefSubsection, PdfXRefTable
from .xref import (CompressedXRefEntry, FreeXRefEntry, InUseXRefEntry,
PdfXRefEntry, PdfXRefSubsection, PdfXRefTable)

__all__ = (
"PdfComment", "PdfHexString", "PdfIndirectRef", "PdfName", "PdfNull",
"PdfObject", "PdfOperator", "PdfStream", "CompressedXRefEntry", "FreeXRefEntry",
"InUseXRefEntry", "PdfXRefEntry", "PdfXRefSubsection", "PdfXRefTable"
)
3 changes: 2 additions & 1 deletion pdfnaut/objects/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class PdfHexString:
(``§ 7.3.4.3 Hexadecimal Strings``)."""

raw: bytes
"""The raw (in hex) value of the string"""
"""The hex value of the string"""

def __post_init__(self) -> None:
# If uneven, we append a zero. (it's hexadecimal -- 2 chars = byte)
Expand All @@ -37,6 +37,7 @@ def __post_init__(self) -> None:

@classmethod
def from_raw(cls, data: bytes):
"""Creates a hexadecimal string from ``data``"""
return cls(hexlify(data))

@property
Expand Down
3 changes: 2 additions & 1 deletion pdfnaut/objects/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def decompress(self) -> bytes:
If no filter is defined, it returns the original contents.
Raises :class:`PdfFilterError` if a filter is unsupported."""
Raises :class:`.pdfnaut.exceptions.PdfFilterError` if a filter is unsupported."""

filters = self.details.get("Filter")
params = self.details.get("DecodeParms")

Expand Down
14 changes: 8 additions & 6 deletions pdfnaut/objects/xref.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,39 @@ class PdfXRefTable:
"""A cross-reference table which permits random access to objects across a PDF.
It is composed of subsections indicating where objects are located. A PDF file
starts with one subsection and additional ones are added per each incremental update.
starts with one section (table) containing one subsection (or two if linearized).
Additional sections are added per each incremental update.
"""
sections: list[PdfXRefSubsection]


@dataclass
class PdfXRefSubsection:
"""A subsection part of an XRef table. Each subsection generally indicates
incremental updates to a document."""
"""A subsection part of an XRef table. A subsection includes ``count`` entries
whose object numbers start at ``first_obj_number`` and are incremented by one."""
first_obj_number: int
count: int
entries: list[PdfXRefEntry]


@dataclass
class FreeXRefEntry:
"""A Type 0 entry. These entries form the linked list of free objects."""
"""A Type 0 (f) entry. These entries are members of the linked list of free objects."""
next_free_object: int
gen_if_used_again: int


@dataclass
class InUseXRefEntry:
"""A Type 1 entry. These point to uncompressed entries currently in use."""
"""A Type 1 (n) entry. These entries point to indirect objects currently in use."""
offset: int
generation: int


@dataclass
class CompressedXRefEntry:
"""A Type 2 entry. These point to entries that are within an object stream."""
"""A Type 2 entry. These entries point to objects that are within an object stream
which is assumed "compressed" although it may not be."""
objstm_number: int
index_within: int

Expand Down
2 changes: 2 additions & 0 deletions pdfnaut/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from .pdf import PdfParser
from .simple import PdfTokenizer

__all__ = ("PdfParser", "PdfTokenizer")
18 changes: 9 additions & 9 deletions pdfnaut/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from ..objects.base import PdfNull, PdfIndirectRef, PdfObject, PdfName, PdfHexString
from ..objects.stream import PdfStream
from ..objects.xref import (
PdfXRefEntry, PdfXRefSubsection, PdfXRefTable, PdfXRefEntry, FreeXRefEntry,
CompressedXRefEntry, InUseXRefEntry
PdfXRefEntry, PdfXRefSubsection, PdfXRefTable,
FreeXRefEntry, InUseXRefEntry, CompressedXRefEntry
)
from ..exceptions import PdfParseError
from ..security_handler import StandardSecurityHandler
Expand All @@ -30,7 +30,7 @@ class PdfParser:
It consumes the PDF's cross-reference tables and trailers. It merges the tables
into a single one and provides an interface to individually parse each indirect
object using :class:`PdfTokenizer`."""
object using :class:`~pdfnaut.parsers.simple.PdfTokenizer`."""

def __init__(self, data: bytes) -> None:
self._tokenizer = PdfTokenizer(data)
Expand All @@ -42,7 +42,7 @@ def __init__(self, data: bytes) -> None:
self.trailer: dict[str, Any] = {}
"""The most recent trailer in the PDF document.
For details on the contents of the trailer, see § 7.5.5 File Trailer of the PDF spec.
For details on the contents of the trailer, see ``§ 7.5.5 File Trailer`` in the PDF spec.
"""

self.xref: dict[tuple[int, int], PdfXRefEntry] = {}
Expand All @@ -61,7 +61,7 @@ def __init__(self, data: bytes) -> None:
"""

self.security_handler = None
"""The document's standard security handler if any, as specified in the Encrypt
"""The document's standard security handler, if any, as specified in the Encrypt
dictionary of the PDF trailer.
This field being set indicates that a supported security handler was used for
Expand Down Expand Up @@ -199,7 +199,7 @@ def parse_simple_xref(self) -> PdfXRefTable:
``§ 7.5.4 Cross-Reference Table`` in the PDF spec.
If ``startxref`` points to an XRef object, :meth:`.parse_compressed_xref`
is called instead.
should be called instead.
"""
self._tokenizer.advance(4)
self._tokenizer.advance_whitespace()
Expand Down Expand Up @@ -315,7 +315,7 @@ def parse_indirect_object(self, xref_entry: InUseXRefEntry) -> PdfObject | PdfSt
length = self.resolve_reference(length)
self._tokenizer.position = _current
if not isinstance(length, int):
raise PdfParseError(f"\\Length entry of stream extent not an integer")
raise PdfParseError("\\Length entry of stream extent not an integer")

stream = PdfStream(tok, self.parse_stream(xref_entry, length))
if indirect_ref is None:
Expand Down Expand Up @@ -417,12 +417,12 @@ def resolve_reference(self, reference: PdfIndirectRef | tuple[int, int]):
"""Resolves a reference into the indirect object it points to.
Arguments:
reference (int | :class:`PdfIndirectRef`):
reference (tuple[int, int] | :class:`.PdfIndirectRef`):
An indirect reference object or a tuple of two integers representing,
in order, the object number and the generation number.
Returns:
A PDF object if the reference was found, otherwise :class:`PdfNull`.
A PDF object if the reference was found, otherwise :class:`.PdfNull`.
"""

if isinstance(reference, tuple):
Expand Down
11 changes: 4 additions & 7 deletions pdfnaut/parsers/simple.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""A PDF tokenizer for objects"""
from __future__ import annotations

import re
from typing import Any

from ..objects.base import (
PdfHexString, PdfName, PdfNull, PdfComment,
PdfIndirectRef, PdfObject, PdfOperator
)
from ..objects.base import (PdfHexString, PdfName, PdfNull, PdfComment,
PdfIndirectRef, PdfObject, PdfOperator)

# as defined in § 7.2.2 Character Set, Table 1 & Table 2
DELIMITERS = b"()<>[]{}/%"
Expand All @@ -28,7 +25,7 @@

class PdfTokenizer:
"""A parser designed to consume objects that do not depend on cross reference
tables. It is used by :class:`PdfParser` for this purpose.
tables. It is used by :class:`~pdfnaut.parsers.pdf.PdfParser` for this purpose.
This parser will not parse indirect objects or streams because those do depend on XRef
and are effectively not sequentially parsable. Because of this limitation, it is not
Expand Down Expand Up @@ -240,7 +237,7 @@ def parse_dictionary(self) -> dict[str, Any]:
}

def parse_array(self) -> list[Any]:
"""Parses an array. Arrays are heterogenous in PDF so they are mapped to ``list``s."""
"""Parses an array. Arrays are heterogeneous in PDF so they are mapped to Python lists."""
self.advance() # past the [
items: list[Any] = []

Expand Down
4 changes: 2 additions & 2 deletions pdfnaut/security_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ def key_length(self) -> int:
return self.encryption.get("Length", 40) // 8

def compute_encryption_key(self, password: bytes) -> bytes:
"""Computes an encryption key as defined in ``§ 7.6.3.3 Encryption Key Algorithm`` >
``Algorithm 2: Computing an encryption key`` in the PDF spec."""
"""Computes an encryption key as defined in ``§ 7.6.3.3 Encryption Key Algorithm >
Algorithm 2: Computing an encryption key`` in the PDF spec."""
padded_password = password[:32] + PASSWORD_PADDING[:32 - len(password)]

psw_hash = md5(padded_password)
Expand Down
Loading

0 comments on commit ca3a981

Please sign in to comment.