In [4]:
from pathlib import Path

# Source and destination folders (edit these to point at your own files)
input_dir = Path(r"C:\Users\datat\OneDrive\Books\9th Maths")
output_dir = Path(r"C:\Users\datat\OneDrive\Books\9th_combined")

# Create the destination folder (and any missing parents) up front,
# then derive the merged file's path from it
output_dir.mkdir(exist_ok=True, parents=True)
output_pdf = output_dir.joinpath("9th_maths.pdf")

# Echo the configured paths as the cell output
(input_dir, output_dir, output_pdf)


(WindowsPath('C:/Users/datat/OneDrive/Books/9th Maths'),
 WindowsPath('C:/Users/datat/OneDrive/Books/9th_combined'),
 WindowsPath('C:/Users/datat/OneDrive/Books/9th_combined/9th_maths.pdf'))

In [5]:
from dataclasses import dataclass, field
from typing import List, Iterable, Optional
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
from pathlib import Path
import re
from tqdm.auto import tqdm

def _natural_key(p: Path):
    """
    Natural sort key: splits filename into chunks of digits and non-digits
    so 'Chapter 9.pdf' < 'Chapter 10.pdf'.
    """
    parts = re.split(r'(\d+)', p.stem)
    key = []
    for part in parts:
        if part.isdigit():
            key.append(int(part))
        else:
            key.append(part.lower())
    return key, p.suffix.lower()

@dataclass
class PDFPipeline:
    """
    Discover, order and merge the PDFs of one folder into a single output PDF.

    Typical use is ``run(dry_run=True)`` to preview the merge order, followed by
    ``run()`` to write ``output_pdf``.

    Attributes:
        input_dir: Folder that is scanned for PDF files.
        output_pdf: Path of the merged PDF to write.
        include_subfolders: Also scan nested folders when True.
        order_mode: "alphabetical" for a plain case-insensitive name sort; any
            other value (including the default "natural") uses the
            numeric-aware natural sort.
        overwrite: When False, combine() refuses to replace an existing output.
        skip_errors: When True, files that cannot be read or decrypted are
            reported with a [WARN] line and skipped; when False the error is raised.
        allow_blank_password: Try an empty password on encrypted files.
        plan: Ordered list of files produced by the most recent order() call.
    """
    input_dir: Path
    output_pdf: Path
    include_subfolders: bool = False
    order_mode: str = "natural"       # "natural" or "alphabetical"
    overwrite: bool = True
    skip_errors: bool = True          # skip corrupt/encrypted files if needed
    allow_blank_password: bool = True # try blank password on encrypted files
    plan: List[Path] = field(default_factory=list)

    def _read_order_file(self) -> Optional[List[Path]]:
        """
        If input_dir/ORDER.txt exists, read it line-by-line to get exact order.
        Each non-blank line should be a filename relative to input_dir.

        Returns:
            The resolved paths of the listed entries that exist and carry a
            ".pdf" suffix (any case), or None when there is no ORDER.txt or none
            of its entries are usable.
        """
        order_file = self.input_dir / "ORDER.txt"
        if not order_file.exists():
            return None
        # "utf-8-sig" drops the byte-order mark that Windows editors commonly
        # prepend; with plain "utf-8" the mark stays glued to the first filename
        # and that entry never matches a file on disk.
        text = order_file.read_text(encoding="utf-8-sig")
        resolved = []
        for raw_line in text.splitlines():
            entry = raw_line.strip()
            if not entry:
                continue
            candidate = (self.input_dir / entry).resolve()
            if candidate.exists() and candidate.suffix.lower() == ".pdf":
                resolved.append(candidate)
            else:
                # Make typos in ORDER.txt visible instead of silently dropping them.
                print(f"[WARN] ORDER.txt entry ignored (not an existing PDF): '{entry}'")
        return resolved if resolved else None

    def discover(self) -> List[Path]:
        """
        Find PDFs in the input_dir (optionally subfolders).

        Only regular files whose name ends in ".pdf" (case-insensitive, matching
        the check used for ORDER.txt entries) are returned, as resolved paths.
        The configured output_pdf is excluded so that an output living inside
        input_dir is not merged into itself on a later run.
        """
        if self.include_subfolders:
            candidates = self.input_dir.rglob("*")
        else:
            candidates = self.input_dir.glob("*")
        output_resolved = self.output_pdf.resolve()
        files = []
        for entry in candidates:
            if not entry.name.lower().endswith(".pdf") or not entry.is_file():
                continue
            resolved = entry.resolve()
            if resolved != output_resolved:
                files.append(resolved)
        return files

    def order(self, files: Iterable[Path]) -> List[Path]:
        """
        Determine the order of PDFs and store it in self.plan.
        Priority:
        1) ORDER.txt if present (only entries that are also in ``files`` are kept)
        2) order_mode: 'natural' (default) or 'alphabetical'

        Returns:
            The ordered list of paths (the same list assigned to self.plan).
        """
        order_from_file = self._read_order_file()
        if order_from_file:
            # Use only files that actually exist in discovered set
            discovered = {f.resolve() for f in files}
            ordered = [p for p in order_from_file if p in discovered]
        elif self.order_mode == "alphabetical":
            # The path is a tie-breaker only: equal names from different folders
            # would otherwise keep the arbitrary order the filesystem listed them in.
            ordered = sorted(files, key=lambda p: (p.name.lower(), p.as_posix().lower()))
        else:  # natural
            ordered = sorted(files, key=lambda p: (_natural_key(p), p.as_posix().lower()))
        self.plan = ordered
        return ordered

    def _append_pdf(self, writer: PdfWriter, pdf_path: Path) -> int:
        """
        Append pages from a single PDF into the writer.

        Returns:
            Number of pages actually appended to ``writer`` (0 when the file was
            skipped).

        Raises:
            Exception: whatever reading/decrypting/appending raised, but only
            when skip_errors is False; otherwise a [WARN] line is printed.
        """
        count_before = len(writer.pages)
        try:
            with open(pdf_path, "rb") as fh:
                reader = PdfReader(fh, strict=False)
                if getattr(reader, "is_encrypted", False):
                    if not self.allow_blank_password:
                        raise PdfReadError(f"Encrypted file: {pdf_path}")
                    # decrypt() signals a rejected password through a falsy
                    # return value rather than an exception, so check the result.
                    if not reader.decrypt(""):
                        raise PdfReadError(
                            f"Encrypted file (blank password rejected): {pdf_path}"
                        )

                # Walk the page tree completely before touching the writer so a
                # broken file is normally rejected as a whole.
                pages = list(reader.pages)
                for page in pages:
                    writer.add_page(page)
                return len(writer.pages) - count_before

        except Exception as e:
            if not self.skip_errors:
                raise
            # Pages added before the failure stay in the writer, so report them
            # honestly to keep the merge report in line with the written file.
            appended = len(writer.pages) - count_before
            if appended:
                print(f"[WARN] '{pdf_path.name}' only partially merged ({appended} pages): {e}")
            else:
                print(f"[WARN] Skipping '{pdf_path.name}': {e}")
            return appended

    def combine(self) -> dict:
        """
        Combine PDFs according to self.plan and write output.

        Returns:
            A small report dict with keys "files_merged", "total_pages" and
            "output".

        Raises:
            ValueError: if self.plan is empty.
            FileExistsError: if the output exists and overwrite is False.
        """
        if not self.plan:
            raise ValueError("No plan available. Run discover() and order() first or run run(dry_run=True) to preview.")

        if self.output_pdf.exists() and not self.overwrite:
            raise FileExistsError(f"Output already exists and overwrite=False: {self.output_pdf}")

        writer = PdfWriter()
        total_pages = 0
        used_files = 0

        for pdf in tqdm(self.plan, desc="Merging PDFs"):
            added = self._append_pdf(writer, pdf)
            if added > 0:
                total_pages += added
                used_files += 1

        # Do not depend on the caller having created the destination folder.
        self.output_pdf.parent.mkdir(parents=True, exist_ok=True)
        with open(self.output_pdf, "wb") as out_f:
            writer.write(out_f)

        return {
            "files_merged": used_files,
            "total_pages": total_pages,
            "output": str(self.output_pdf)
        }

    def run(self, dry_run: bool = False) -> List[Path] | dict:
        """
        Orchestrate: discover -> order -> (optionally) combine.
        If dry_run=True: returns the ordered file list (no write).
        Else: merges and returns a report dict.
        """
        files = self.discover()
        ordered = self.order(files)
        if dry_run:
            return ordered
        return self.combine()


In [6]:

# Merge settings, one per line so each is easy to toggle
pipeline_options = dict(
    input_dir=input_dir,
    output_pdf=output_pdf,
    include_subfolders=False,   # True -> also pick up PDFs in nested folders
    order_mode="natural",       # alternative: "alphabetical"
    overwrite=True,             # False -> refuse to replace an existing output
    skip_errors=True,           # bad/encrypted files are skipped, not fatal
    allow_blank_password=True,  # try an empty password on encrypted files
)
pipeline = PDFPipeline(**pipeline_options)

# Dry run: discovers and orders the files only, nothing is written
preview_list = pipeline.run(dry_run=True)
print(f"Found {len(preview_list)} PDFs. Merge order:")
for position, pdf_path in enumerate(preview_list, start=1):
    print(f"{position:>3}. {pdf_path.name}")


Found 16 PDFs. Merge order:
  1. iemh1ps.pdf
  2. iemh101.pdf
  3. iemh102.pdf
  4. iemh103.pdf
  5. iemh104.pdf
  6. iemh105.pdf
  7. iemh106.pdf
  8. iemh107.pdf
  9. iemh108.pdf
 10. iemh109.pdf
 11. iemh110.pdf
 12. iemh111.pdf
 13. iemh112.pdf
 14. Z_iemh1a1.pdf
 15. Z_iemh1a2.pdf
 16. Z_iemh1an.pdf


In [7]:
# Real merge this time: run() defaults to dry_run=False, so the output PDF is
# written and a summary dict comes back.
report = pipeline.run()

# Show the summary as the cell output
report


Merging PDFs:   0%|          | 0/16 [00:00<?, ?it/s]

{'files_merged': 16,
 'total_pages': 232,
 'output': 'C:\\Users\\datat\\OneDrive\\Books\\9th_combined\\9th_maths.pdf'}

In [8]:
from pypdf import PdfReader
# Sanity-check the merged file: confirm it is on disk, then count its pages.
# The existence check runs first so that a missing output is reported here
# instead of surfacing as an exception from PdfReader.
if output_pdf.exists():
    r = PdfReader(str(output_pdf))
    print(f"Output exists: True | Pages: {len(r.pages)}")
else:
    print(f"Output exists: False | Expected at: {output_pdf}")


Output exists: True | Pages: 232
