##### Load PDF file

In [2]:
from langchain_community.document_loaders import(
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader
)


##### 1.PyPDFLoader

In [13]:
try:
    pypdf_loader = PyPDFLoader("data/pdf/attention.pdf")
    pypdf_docs = pypdf_loader.load()
    # print(pypdf_docs)
    print(f"Loaded {len(pypdf_docs)} pages")
    print(f"Page 1 content: {pypdf_docs[0].page_content[:100]}")
    print(f"MetaDeta: {pypdf_docs[0].metadata}")

except Exception as e:
    print("Error",e)


Loaded 11 pages
Page 1 content: Attention Is All Y ou Need
Ashish V aswani∗
Google Brain
avaswani@google.com
Noam Shazeer ∗
Google B
MetaDeta: {'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'subject': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms. We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 

##### 2.PyMuPDFLoader

In [14]:
try:
    pymupdf_loader = PyMuPDFLoader("data/pdf/attention.pdf")
    pymupdf_docs = pymupdf_loader.load()
    # print(pypdf_docs)
    print(f"Loaded {len(pymupdf_docs)} pages")
    print(f"Page 1 content: {pymupdf_docs[0].page_content[:100]}")
    print(f"MetaDeta: {pymupdf_docs[0].metadata}")

except Exception as e:
    print("Error",e)


Loaded 11 pages
Page 1 content: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brai
MetaDeta: {'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'data/pdf/attention.pdf', 'file_path': 'data/pdf/attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'subject': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms. We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more paralle

###### Handling PDF challenges

In [4]:
# Example of raw PDF extraction
raw_pdf_text = """Company Financial Report


the finanacial performance for fiscal year 2025
shows significant growth in probability.

Revenue increased by 25%

The company's efficiency improved due to workflow
optimization


Page  1 of 10
"""

def clean_text(text):
    text = " ".join(text.split())

    text = text.replace("fl","fi")
    text = text.replace("fL","fl")

    return text


cleaned_text = clean_text(raw_pdf_text)

print("Before:")
print(repr(raw_pdf_text[:100]))
print("After:")
print(repr(cleaned_text))


Before:
'Company Financial Report\n\n\nthe finanacial performance for fiscal year 2025\nshows significant growth '
After:
"Company Financial Report the finanacial performance for fiscal year 2025 shows significant growth in probability. Revenue increased by 25% The company's efficiency improved due to workfiow optimization Page 1 of 10"


In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
from langchain_core.documents import Document
class SmartPDFProcessor:
    "Advanced PDF processing with error handling"
    def __init__(self,chunk_size=1000,chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "]
        )
    

    def process_pdf(self,pdf_path:str) -> list[Document]:
        # Load the pdf
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        processed_chunks = []

        for page_num,page in enumerate(pages):
            # clean the text
            cleaned_text = self._clean_text(page.page_content)
            
            if len(cleaned_text.strip()) < 50:
                continue

            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page":page_num+1,
                    "total_pages":len(pages),
                    "chunk_method":"smart_pdf_processor",
                    "char_count":len(cleaned_text)
                }]
            )

            processed_chunks.extend(chunks)

        return processed_chunks
    
    
    def _clean_text(self,text:str) -> str:
        "Cleaned extracted text"
        text = " ".join(text.split())
        text = text.replace("fI","fi")
        text = text.replace("fL","fl")

        return text   









In [12]:
preprocessor = SmartPDFProcessor()

In [17]:
try:
    smart_chunks = preprocessor.process_pdf("data/pdf/attention.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    if smart_chunks:
        print("\nSample chunk metadeta")
        for key,value in smart_chunks[0].metadata.items():
            print(f"{key} : {value}")

except Exception as e:
    print("Processing Error:",e)

Processed into 40 smart chunks

Sample chunk metadeta
producer : PyPDF2
creator : PyPDF
creationdate : 
title : Attention is All you Need
author : Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin
subject : The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms. We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On E