# Load PDF

In [None]:
from langchain_community.document_loaders import(
PyPDFLoader,
PyMuPDFLoader,
UnstructuredPDFLoader
)

In [None]:
# PyPDFLoader
# Intended for standard, text-based PDFs.
# Page numbers are preserved in the metadata; its basic text extraction may
# not work well on every document.

try:
    pypdf_loader = PyPDFLoader("data/pdf/attention.pdf")
    pypdf_docs = pypdf_loader.load()
except Exception as e:
    print(f"error :{e}")
else:
    # Report only after a successful load: if loading failed, pypdf_docs is
    # either undefined (NameError) or left over from an earlier run.
    print(f"pages {len(pypdf_docs)}\n")
    # Guard against an empty result so indexing the first page cannot fail.
    if pypdf_docs:
        print(f"metadata {pypdf_docs[0].metadata} \n")
        print(f"content {pypdf_docs[0].page_content[:100]} \n")

In [None]:
# PyMuPDFLoader
# Faster.
# Supports both image and text extraction.

try:
    # NOTE: this cell reuses the names pypdf_loader / pypdf_docs from the
    # PyPDFLoader cell, so they now refer to the PyMuPDF results.
    pypdf_loader = PyMuPDFLoader("data/pdf/attention.pdf")
    pypdf_docs = pypdf_loader.load()
except Exception as e:
    print(f"error :{e}")
else:
    # Report only after a successful load: on failure pypdf_docs would still
    # hold the PyPDFLoader output (or be undefined), which would be misleading.
    print(f"pages {len(pypdf_docs)}\n")
    # Guard against an empty result so indexing the first page cannot fail.
    if pypdf_docs:
        print(f"metadata {pypdf_docs[0].metadata} \n")
        print(f"content {pypdf_docs[0].page_content[:100]} \n")

# Reading Complex PDF 

In [None]:
# Sample of messy text as it might come out of a PDF: it contains ligature
# glyphs (e.g. "ﬁ", "ﬂ"), irregular indentation and blank lines, and a
# "Page 1 of 10" footer. Used below to demonstrate the cleaning function.
raw_pdf_text = """Company Financial Report


    The ﬁnancial performance for ﬁscal year 2024
    shows signiﬁcant growth in proﬁtability.
    
    
    
    Revenue increased by 25%.
    
The company's efﬁciency improved due to workﬂow
optimization.


Page 1 of 10
"""

# Define and apply a text-cleaning function

# Latin ligature glyphs (Unicode U+FB00-U+FB06) mapped to the plain letters
# they represent. Built once so clean_text can replace them in a single pass.
_LIGATURE_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "st",  # long-s + t ligature
    "\ufb06": "st",
})


def clean_text(text: str) -> str:
    """Normalize text extracted from a PDF.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space, strips leading/trailing whitespace, and expands Latin ligature
    glyphs such as "ﬁ", "ﬂ", "ﬀ", "ﬃ" and "ﬄ" into their component letters.

    Args:
        text: Raw text, typically the content of one PDF page.

    Returns:
        The cleaned, single-line text. An empty or whitespace-only input
        yields an empty string.
    """
    # Remove excessive whitespace
    collapsed = " ".join(text.split())

    # Fix ligatures (one translate pass instead of chained replace calls)
    return collapsed.translate(_LIGATURE_TABLE)

# Preview the first 100 characters of the sample before and after cleaning.
print(f"before :\n {'-' * 10}")
print(raw_pdf_text[:100])
print(f"after :\n {'-' * 10}")
cleaned = clean_text(raw_pdf_text)
print(cleaned[:100])

# PDF Processor Class

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List

In [None]:

# This processor loads a PDF, cleans each page's text, splits it into chunks, attaches metadata, and returns a list of Document objects.

class SmartPDFProcessor:
    """Load a PDF, clean each page's text, and split it into tagged chunks.

    Each page is loaded with PyPDFLoader, normalized by ``_clean_text``,
    skipped if too short, and split with a RecursiveCharacterTextSplitter.
    Every resulting chunk carries the loader's page metadata plus a few
    extra fields (see ``process_pdf``).

    No exceptions are caught here: errors raised by the loader or the
    splitter propagate to the caller.

    Args:
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.
        min_page_chars: Pages whose cleaned text is shorter than this many
            characters are skipped.
    """

    # Latin ligature glyphs (Unicode U+FB00-U+FB06) mapped to the plain
    # letters they represent; applied in one pass by _clean_text.
    _LIGATURE_TABLE = str.maketrans({
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\ufb05": "st",  # long-s + t ligature
        "\ufb06": "st",
    })

    def __init__(self, chunk_size=1000, chunk_overlap=100, min_page_chars=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # _clean_text collapses all whitespace to single spaces, so a
            # space is the only separator left in the text being split.
            separators=[" "],
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Load ``pdf_path`` and return its cleaned, chunked content.

        Args:
            pdf_path: Path of the PDF file handed to PyPDFLoader.

        Returns:
            A list of Document chunks in page order. Each chunk's metadata
            contains the metadata produced by the loader for its page, with
            these keys added or overridden: ``page`` (1-based page number),
            ``total_page`` (number of pages loaded), ``chunk_method`` and
            ``char_count`` (length of the page's cleaned text, not of the
            individual chunk).
        """
        # load pdf
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        processed_chunks: List[Document] = []

        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip pages with too little text to be worth chunking.
            if len(cleaned_text) < self.min_page_chars:
                continue

            # Keep the loader's metadata (e.g. the source path) and layer the
            # processor's own fields on top instead of discarding it.
            metadata = {
                **page.metadata,
                "page": page_num,
                "total_page": total_pages,
                "chunk_method": "smart_pdf_processor",
                "char_count": len(cleaned_text),
            }

            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[metadata],
            )
            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and expand ligatures."""
        collapsed = " ".join(text.split())
        return collapsed.translate(self._LIGATURE_TABLE)


In [None]:

# Use SmartPDFProcessor on the sample paper and preview the first chunk.
preprocessor = SmartPDFProcessor()

try:
    # Keep the try body to the one call that can actually fail.
    chunked_docs = preprocessor.process_pdf("data/pdf/attention.pdf")
except Exception as e:
    # Top-level notebook boundary: report the failure instead of aborting.
    print(f"processing error: {e}")
else:
    print(f"processed into {len(chunked_docs)} smart chunk")

    # Only preview a chunk when the PDF produced at least one.
    if chunked_docs:
        chunk = chunked_docs[0]
        print(f"metadata of first item:\n {chunk.metadata}")
        print(f"content of first item:\n {chunk.page_content}")