Load PDFs

In [1]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data parsing using PyPDFLoader.
# load() returns a list of Document objects; as the cell output shows, each
# one carries a metadata dict (source, page, total_pages, ...) and page_content.
try:
    pyPdfLoader = PyPDFLoader("data/pdf/SLMS.pdf")
    pypdf_docs = pyPdfLoader.load()
    print(pypdf_docs)

except Exception as e:
    # Demo cell: report the failure instead of raising.
    # NOTE: if loading fails, this cell does not assign pypdf_docs.
    print(f"error: {e}")

[Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': 'PyPDF', 'creationdate': '2025-12-23T10:51:41-08:00', 'moddate': '2025-12-23T10:51:41-08:00', 'subject': 'ACM Trans. Intell. Syst. Technol. 2025.16:1-87', 'title': 'A Comprehensive Survey of Small Language Models in the Era of Large Language Models: Techniques, Enhancements, Applications, Collaboration with LLMs, and Trustworthiness', 'source': 'data/pdf/SLMS.pdf', 'total_pages': 88, 'page': 0, 'page_label': '1'}, page_content='. \n. \n Latest updates: h\ue03cps://dl.acm.org/doi/10.1145/3768165\n. \n. \n SURVEY\nA Comprehensive Survey of Small Language Models\nin the Era of Large Language Models: Techniques,\nEnhancements, Applications, Collaboration with LLMs,\nand Trustworthiness\nFALI WANG, Pennsylvania State University, University Park, PA, United\nStates\n. \n ZHIWEI ZHANG, Pennsylvania State University, University Park, PA,\nUnited States\n. \n XIANREN ZHANG, Pennsylvania State University, University Park, PA,\nUn

In [3]:
# Data parsing using PyMuPDFLoader, on the same file as the PyPDFLoader cell
# so the two loaders' metadata and extracted text can be compared.

try:
    pymuPDFLoader = PyMuPDFLoader("data/pdf/SLMS.pdf")
    pymupdf_docs = pymuPDFLoader.load()
    print(pymupdf_docs)
    print("1st page pdf content is")
    # Index 0 is the first Document in the returned list (metadata 'page': 0).
    print(pymupdf_docs[0].page_content)


except Exception as e:
    # Demo cell: report the failure instead of raising.
    print(f"error {e}")

[Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': '', 'creationdate': '2025-12-23T10:51:41-08:00', 'source': 'data/pdf/SLMS.pdf', 'file_path': 'data/pdf/SLMS.pdf', 'total_pages': 88, 'format': 'PDF 1.4', 'title': 'A Comprehensive Survey of Small Language Models in the Era of Large Language Models: Techniques, Enhancements, Applications, Collaboration with LLMs, and Trustworthiness', 'author': '', 'subject': 'ACM Trans. Intell. Syst. Technol. 2025.16:1-87', 'keywords': '', 'moddate': '2025-12-23T10:51:41-08:00', 'trapped': '', 'modDate': "D:20251223105141-08'00'", 'creationDate': "D:20251223105141-08'00'", 'page': 0}, page_content='.\n.\nLatest updates: h\ue03cps://dl.acm.org/doi/10.1145/3768165\n.\n.\nSURVEY\nA Comprehensive Survey of Small Language Models\nin the Era of Large Language Models: Techniques,\nEnhancements, Applications, Collaboration with LLMs,\nand Trustworthiness\nFALI WANG, Pennsylvania State University, University Park, PA, United\nStates\n.\nZHIWEI Z

In [4]:
# Sample input for the text cleaner: two short paragraphs with deliberately
# irregular spacing, stray line breaks and trailing blank lines.
raw_text= """
Here   is   a    small    paragraph
with   irregular spacing,
random   line breaks,     and    extra   spaces

that     don’t   quite   make   sense,
but   still   read   fine.

This     is     the     second     paragraph,
it    also   has     odd      spacing
and line   changes

scattered      throughout      to   make
cleaning     slightly      annoying.



"""

In [5]:
#basic text cleaner
def clean_text(text):
    """Collapse every run of whitespace into a single space.

    Spaces, tabs and line breaks are all treated as separators, and leading
    or trailing whitespace disappears as part of the split/join round trip.

    Args:
        text: The raw string to normalise.

    Returns:
        The words of ``text`` joined by single spaces.
    """
    words = text.split()
    # Ligature handling is intentionally left out of this basic version:
    # add text.replace(...) calls here for whatever substitutions you need.
    return " ".join(words)

In [6]:
# Run the basic cleaner on the sample text and show before/after.
cleaned_text = clean_text(raw_text)

# Each print joins its arguments with newlines, so the emitted text is the
# same as printing every item on its own line.
print("raw text:", raw_text, sep="\n")
print("-----------------------", "cleaned text: ", cleaned_text, sep="\n")

raw text:

Here   is   a    small    paragraph
with   irregular spacing,
random   line breaks,     and    extra   spaces

that     don’t   quite   make   sense,
but   still   read   fine.

This     is     the     second     paragraph,
it    also   has     odd      spacing
and line   changes

scattered      throughout      to   make
cleaning     slightly      annoying.




-----------------------
cleaned text: 
Here is a small paragraph with irregular spacing, random line breaks, and extra spaces that don’t quite make sense, but still read fine. This is the second paragraph, it also has odd spacing and line changes scattered throughout to make cleaning slightly annoying.


In [7]:
from langchain_core.documents import Document
from typing import List
class SmartPdfProcessor:
    """Load a PDF page by page, clean the text and split it into chunks.

    Pages are read with ``PyPDFLoader``, normalised by :meth:`clean_text`
    and split with a ``RecursiveCharacterTextSplitter``. Exceptions raised
    while loading or splitting are NOT caught here; callers should wrap
    :meth:`process_pdf` if they want best-effort behaviour.

    Args:
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter.
        min_page_chars: Pages whose cleaned text is shorter than this many
            characters are skipped (default 50, the previous hard-coded
            threshold).
    """

    # Unicode "Alphabetic Presentation Forms" ligatures -> plain letters.
    # Written as escapes so the code points are unambiguous in any font.
    _LIGATURES = {
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        # U+FB05 is LATIN SMALL LIGATURE LONG S T (long s + t): it reads
        # "st", even though the glyph looks like "ft".
        "\ufb05": "st",
        "\ufb06": "st",
    }
    # Single-pass translation table built once for the class.
    _LIGATURE_TABLE = str.maketrans(_LIGATURES)

    def __init__(self, chunk_size=1000, chunk_overlap=100, min_page_chars=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # clean_text() leaves single spaces as the only whitespace, so
            # " " is the natural boundary. The trailing "" is a
            # character-level fallback so a single token longer than
            # chunk_size is still split instead of becoming an oversized chunk.
            separators=[" ", ""],
        )

    def process_pdf(self, pdf_path: str) -> "List[Document]":
        """Load ``pdf_path`` and return cleaned, chunked Documents.

        Every chunk inherits the metadata of the page it came from, with
        these keys set or overridden:

        * ``page``: 1-based position of the page in the loaded list
          (replaces the loader's own ``page`` value).
        * ``total_pages``: number of pages returned by the loader.
        * ``chunk_method``: the constant ``"smart_pdf_processor"``.
        * ``char_count``: length of the whole cleaned page text (not of the
          individual chunk).

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            A flat list of chunk Documents in page order. Pages whose
            cleaned text is shorter than ``min_page_chars`` contribute
            nothing.
        """
        pages = PyPDFLoader(pdf_path).load()
        total_pages = len(pages)

        processed_chunks = []
        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self.clean_text(page.page_content)

            # clean_text() already trims the ends, so len() alone is enough
            # to detect near-empty pages (blank pages, lone page numbers...).
            if len(cleaned_text) < self.min_page_chars:
                continue

            page_chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text),
                }],
            )
            processed_chunks.extend(page_chunks)
        return processed_chunks

    def clean_text(self, text: str) -> str:
        """Normalise whitespace and expand typographic ligatures.

        Args:
            text: Raw text extracted from a PDF page. ``None`` or an empty
                string yields ``""``.

        Returns:
            ``text`` with every run of whitespace collapsed to one space,
            leading/trailing whitespace removed, and the ligatures
            U+FB00..U+FB06 replaced by their plain-letter spelling.
        """
        if not text:
            return ""
        collapsed = " ".join(text.split())
        return collapsed.translate(self._LIGATURE_TABLE)


In [8]:
# Build a processor with the constructor defaults
# (chunk_size=1000, chunk_overlap=100).
preprocessor = SmartPdfProcessor()

In [10]:
# Chunk the PDF and print the metadata of the first chunk as a sanity check.
try:
    smart_chunks = preprocessor.process_pdf("data/pdf/SLMS.pdf")
    # Guard against an empty result before indexing the first chunk.
    if smart_chunks:
        print("sample data for smart chunks")
        for key,value in smart_chunks[0].metadata.items():
            print(f"{key} : {value}")

except Exception as r:
    # Demo cell: any failure is printed rather than raised.
    print(r)



sample data for smart chunks
producer : iText 4.2.0 by 1T3XT
creator : PyPDF
creationdate : 2025-12-23T10:51:41-08:00
moddate : 2025-12-23T10:51:41-08:00
subject : ACM Trans. Intell. Syst. Technol. 2025.16:1-87
title : A Comprehensive Survey of Small Language Models in the Era of Large Language Models: Techniques, Enhancements, Applications, Collaboration with LLMs, and Trustworthiness
source : data/pdf/SLMS.pdf
total_pages : 88
page : 1
page_label : 1
chunk_method : smart_pdf_processor
char_count : 1342
