### LOAD PDF FILES

In [2]:
from langchain_community.document_loaders import(
    PyPDFLoader,
    UnstructuredPDFLoader,
    PyMuPDFLoader
)

In [3]:
### pyPDFLoader
print("PY PDF LOADER")
try:
    pdf_loader = PyPDFLoader("data/pdf/rapport.pdf")
    pdf_docs = pdf_loader.load()
    print(pdf_docs)
except Exception as e:
    print(f"PyPDFLoader failed: {e}")

PY PDF LOADER
[Document(metadata={'producer': 'PyPDF', 'creator': 'Microsoft Word', 'creationdate': '2025-07-30T13:45:47-07:00', 'author': 'firdaous.charchaoui@etu.uae.ac.ma', 'moddate': '2025-07-30T13:45:47-07:00', 'source': 'data/pdf/rapport.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='1 \n \nRapport de Projet :  Pr√©diction des maladies \ncardiaques \nI. Introduction \n1. Contexte du Projet pour HealthCare Innovate \nLa d√©tection pr√©coce des maladies cardiovasculaires est un enjeu majeur pour l\'h√¥pital \nHealthCare Innovate. L\'afflux de patients et la complexit√© des profils cliniques rendent difficile \nla priorisation efficace de ceux qui b√©n√©ficieraient le plus d\'examens sp√©cialis√©s.  \nCe projet a √©t√© initi√© pour d√©terminer si l\'intelligence artificielle peut fournir une assistance \nfiable aux √©quipes m√©dicales en analysant les donn√©es patientes existantes pour estimer le \nrisque de maladie cardiaque. \n2. Objectifs Strat√©giques \n‚Ä¢

In [5]:
# Method 2: PyMuPDFLoader (Fast and accurate)
print("\nüìò PyMuPDFLoader")
try:
    pymupdf_loader = PyMuPDFLoader("data/pdf/rapport.pdf")
    pymupdf_docs = pymupdf_loader.load()

    print(f"  Loaded {len(pymupdf_docs)} pages")
    print("  Includes detailed metadata")
    print(pymupdf_docs)
except Exception as e:
    print(f"  Error: {e}")


üìò PyMuPDFLoader
  Error: name 'PyMuPDFLoader' is not defined


üìÑ PDF Loader Comparison:

PyPDFLoader:
  ‚úÖ Simple and reliable
  ‚úÖ Good for most PDFs
  ‚úÖ Preserves page numbers
  ‚ùå Basic text extraction
  Use when: Standard text PDFs

PyMuPDFLoader:
  ‚úÖ Fast processing
  ‚úÖ Good text extraction
  ‚úÖ Image extraction support
  Use when: Speed is important


In [6]:
# Example of raw PDF extraction
raw_pdf_text = """Company Financial Report

The financial performance for fiscal year 2024
shows significant growth in profitability.

Revenue increased by 25%.

The company's efficiency improved due to workflow
optimization.

Page 1 of 10
"""

# Apply the cleaning function
def clean_text(text):
    # Remove excessive whitespace
    text = " ".join(text.split())

    # Fix ligatures
    text = text.replace("Ô¨Å", "fi")
    text = text.replace("Ô¨Ç", "fl")

    return text


cleaned = clean_text(raw_pdf_text)
print("BEFORE:")
print(repr(raw_pdf_text[:100]))
print("\nAFTER:")
print(repr(cleaned[:100]))



BEFORE:
'Company Financial Report\n\nThe financial performance for fiscal year 2024\nshows significant growth in'

AFTER:
'Company Financial Report The financial performance for fiscal year 2024 shows significant growth in '


In [7]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
from langchain_core.documents import Document
from typing import List

class SmartPDFProcessor:
    """Advanced PDF processing with error handling"""

    def __init__(self, chunk_size=1000, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "]
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Process PDF with smart chunking and metadata enhancement"""

        # Load PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        ## Process each page
        processed_chunks = []

        for page_num, page in enumerate(pages):
            ## clean text
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages
            if len(cleaned_text.strip()) < 50:
                continue

            # Create chunks with enhanced metadata
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num + 1,
                    "total_pages": len(pages),
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text)
                }]
            )
            processed_chunks.extend(chunks)
        return processed_chunks
    
    def _clean_text(self, text: str) -> str:
        """clean extracted text"""

        # Remove excessive whitespace
        text = " ".join(text.split())

        # Fix ligatures
        text = text.replace("Ô¨Å", "fi")
        text = text.replace("Ô¨Ç", "fl")

        return text


In [9]:
preprocessor = SmartPDFProcessor()

In [10]:
## Process a PDF if available
try:
    smart_chunks = preprocessor.process_pdf("data/pdf/rapport.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    # Show enhanced metadata
    if smart_chunks:
        print("\nSample chunk metadata:")
        for key, value in smart_chunks[0].metadata.items():
            print(f"  {key}: {value}")

except Exception as e:
    print(f"Processing error: {e}")


Processed into 10 smart chunks

Sample chunk metadata:
  producer: PyPDF
  creator: Microsoft Word
  creationdate: 2025-07-30T13:45:47-07:00
  author: firdaous.charchaoui@etu.uae.ac.ma
  moddate: 2025-07-30T13:45:47-07:00
  source: data/pdf/rapport.pdf
  total_pages: 5
  page: 1
  page_label: 1
  chunk_method: smart_pdf_processor
  char_count: 1911
