In [1]:
import os
import pandas as pd
from typing import List, Dict, Any
from langchain_core.documents import Document
from langchain.text_splitter import (RecursiveCharacterTextSplitter,
                                     CharacterTextSplitter,
                                     TokenTextSplitter)
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader
)

# PyPDFLoader

In [2]:
# Load the paper with PyPDFLoader. The result is kept in `pypdf_docs`, which
# the following cell inspects.
try:
    pypdf_loader = PyPDFLoader("../data/pdf/attention is all you need.pdf")
    pypdf_docs = pypdf_loader.load()
except Exception as e:
    print(f"Error: {e}")
    # Re-raise instead of swallowing: later cells read `pypdf_docs`
    # unconditionally, so continuing would only surface as a NameError
    # further down, far from the real cause.
    raise

In [3]:
# Sanity check on the PyPDFLoader result: print how many documents were
# returned, then display the first 100 characters of the first document
# (the trailing bare expression is what the notebook renders).
print(len(pypdf_docs))
pypdf_docs[0].page_content[:100]

15


'Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and'

# PyMuPDFLoader

In [4]:
# Load the same paper with PyMuPDFLoader. The result is kept in
# `pymupdf_docs`, which the following cell inspects.
try:
    pymupdf_loader = PyMuPDFLoader("../data/pdf/attention is all you need.pdf")
    pymupdf_docs = pymupdf_loader.load()
except Exception as e:
    print(f"Error: {e}")
    # Re-raise instead of swallowing: later cells read `pymupdf_docs`
    # unconditionally, so continuing would only surface as a NameError
    # further down, far from the real cause.
    raise

In [5]:
# Sanity check on the PyMuPDFLoader result: print how many documents were
# returned, then display the first 100 characters of the first document
# (the trailing bare expression is what the notebook renders).
print(len(pymupdf_docs))
pymupdf_docs[0].page_content[:100]

15


'Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and'

# PDF Pre-processing

In [None]:
class PDFProcessor:
    """Load a PDF with ``PyPDFLoader`` and split each page's text into chunks.

    For every loaded page the text is whitespace-normalised, pages with too
    little text are skipped, and the remaining text is handed to a
    ``RecursiveCharacterTextSplitter``. The metadata passed along with each
    page is the loader's own metadata plus ``page`` (1-based), ``total_pages``,
    ``chunk_method`` and ``char_count``.
    """

    chunk_size: int
    chunk_overlap: int
    min_page_chars: int

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100,
                 min_page_chars: int = 50):
        """Configure the splitter and the page filter.

        Args:
            chunk_size: Forwarded to the splitter as its ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``.
            min_page_chars: Pages whose cleaned text is shorter than this are
                skipped. Defaults to 50, the previously hard-coded threshold.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_overlap=self.chunk_overlap,
            chunk_size=self.chunk_size,
            # `_clean_text` collapses all whitespace to single spaces, so " "
            # is the only natural boundary left in the text. The trailing ""
            # is a last-resort fallback so that a whitespace-free run longer
            # than `chunk_size` is still cut down instead of being emitted as
            # one oversized chunk.
            separators=[" ", ""],
        )

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace to one space and trim both ends."""
        return " ".join(text.split())

    def _chunk_pages(self, pages: "List[Document]") -> "List[Document]":
        """Clean, filter and split already-loaded pages (no file access).

        Args:
            pages: Page objects exposing ``page_content`` and ``metadata``.

        Returns:
            The chunks produced by the splitter for all kept pages, in page
            order.
        """
        total_pages = len(pages)
        processed_chunks = []

        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # `_clean_text` already trims the text, so its length can be
            # compared directly. Empty pages are always skipped, even when
            # `min_page_chars` is 0.
            if not cleaned_text or len(cleaned_text) < self.min_page_chars:
                continue

            processed_chunks.extend(
                self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[{
                        # A fresh dict is built, so the page's own metadata is
                        # left untouched; keys below override loader values.
                        **page.metadata,
                        "page": page_num,  # 1-based position among the loaded pages
                        "total_pages": total_pages,
                        "chunk_method": "pdf processor",
                        # Length of the whole cleaned page, not of one chunk.
                        "char_count": len(cleaned_text),
                    }],
                )
            )

        return processed_chunks

    # The return annotation is quoted so it is not evaluated when the class
    # body is executed.
    def process_pdf(self, pdf_path: str) -> "List[Document]":
        """Process PDF with chunking and metadata enhancement.

        Args:
            pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

        Returns:
            The list of chunks for the whole document. The number of chunks
            is also printed.
        """
        pages = PyPDFLoader(pdf_path).load()
        processed_chunks = self._chunk_pages(pages)

        print(f"Number of chunks generated: {len(processed_chunks)}")

        return processed_chunks

In [7]:
# Build a processor with its default settings (chunk_size=1000,
# chunk_overlap=100) and chunk the paper. `process_pdf` prints the number of
# chunks it produced; the chunks themselves are kept in `chunks` for the
# inspection cells below.
pdf_processor = PDFProcessor()
chunks = pdf_processor.process_pdf("../data/pdf/attention is all you need.pdf")

Number of chunks generated: 49


In [8]:
# Preview the first chunk's text. Whitespace was collapsed during processing,
# so the newline visible in the raw loader output appears here as a space.
chunks[0].page_content[:100]

'Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and'

In [9]:
# Metadata of the first chunk: the loader's page metadata plus the fields set
# by PDFProcessor ("page", "total_pages", "chunk_method", "char_count").
# Note that "char_count" is the length of the whole cleaned page, not of this
# chunk.
chunks[0].metadata

{'producer': 'pdfTeX-1.40.25',
 'creator': 'LaTeX with hyperref',
 'creationdate': '2024-04-10T21:11:43+00:00',
 'author': '',
 'keywords': '',
 'moddate': '2024-04-10T21:11:43+00:00',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
 'subject': '',
 'title': '',
 'trapped': '/False',
 'source': '../data/pdf/attention is all you need.pdf',
 'total_pages': 15,
 'page': 1,
 'page_label': '1',
 'chunk_method': 'pdf processor',
 'char_count': 2857}