### Data Ingestion Pipeline

In [7]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [None]:
## Read PDF file
def process_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into a flat document list.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    loaded with ``PyMuPDFLoader`` and every document it yields is tagged with
    two extra metadata keys: ``source_file`` (the bare file name) and
    ``file_type`` (always ``"pdf"``).

    Loading is best-effort: a file that fails to load is reported on stdout
    and skipped, so one bad PDF does not abort the whole run.

    Args:
        pdf_directory: Path (str or ``Path``) of the directory to scan.

    Returns:
        A list with the documents of all successfully loaded PDFs, in sorted
        path order. Empty when no PDF file is found.
    """
    pdf_dir = Path(pdf_directory)

    # Path.glob yields matches in arbitrary, filesystem-dependent order; sort
    # them so repeated runs produce the same document order. Directories whose
    # name happens to end in ".pdf" are dropped because only files can be loaded.
    pdf_files = sorted(
        path for path in pdf_dir.glob('**/*.pdf') if path.is_file()
    )

    all_documents = []
    for pdf_file in pdf_files:
        print(f"Processing: {pdf_file}")
        try:
            documents = PyMuPDFLoader(str(pdf_file)).load()

            # Record where each document came from for later filtering/citation.
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = "pdf"

            all_documents.extend(documents)
        except Exception as e:
            # Deliberate catch-all at the per-file boundary: report and move on.
            print(f"Error processing {pdf_file}: {e}")

    return all_documents

# Try the loader on the whole data directory. As the last expression in the
# cell, the returned list of documents is echoed as the cell output.
process_pdfs('../data')


Processing: ..\data\pdf_files\invoice_INV-20251105-001.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-002.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-003.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-004.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-005.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-006.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-007.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-008.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-009.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-010.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-011.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-012.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-013.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-014.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-015.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-016.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-017.p

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': '', 'creationdate': 'D:20251105025513', 'source': '..\\data\\pdf_files\\invoice_INV-20251105-001.pdf', 'file_path': '..\\data\\pdf_files\\invoice_INV-20251105-001.pdf', 'total_pages': 1, 'format': 'PDF 1.3', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20251105025513', 'page': 0, 'source_file': 'invoice_INV-20251105-001.pdf', 'file_type': 'pdf'}, page_content='Invoice\nInvoice Number: INV-20251105-001\nDate: 2025-11-05\nYour Company Name\n1234 Business St.\nCity, State, ZIP\nPhone: (123) 456-7890\nEmail: contact@yourcompany.com\nBill To:\nCustomer Name\n5678 Client Ave.\nCity, State, ZIP\nDescription\nQty\nUnit Price\nTotal\nProduct A\n2\n$10.00\n$20.00\nProduct B\n1\n$15.50\n$15.50\nService C\n3\n$7.25\n$21.75\nTotal Amount:\n$57.25\nPayment Terms: Due within 30 days. Late payments will incur a 5% late fee per month.'),
 

In [9]:
## Text into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks and print a short summary.

    A ``RecursiveCharacterTextSplitter`` is built with the given size and
    overlap, ``len`` as the length function, and the separator list
    ``["\\n\\n", "\\n", " ", ""]``. The total chunk count is printed, and when
    at least one chunk exists, a preview of the first one (the first 100
    characters of its content, plus its metadata) is printed as well.

    Args:
        documents: Documents to pass to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    chunks = splitter.split_documents(documents)
    print(f"Total chunks created: {len(chunks)}")

    # Nothing to preview when the splitter produced no chunks.
    if not chunks:
        return chunks

    first_chunk = chunks[0]
    print('\nExample chunk:')
    print(f"Content: {first_chunk.page_content[:100]}...")
    print(f"Metadata: {first_chunk.metadata}")
    return chunks

In [13]:
# Run the ingestion pipeline end to end: load the PDFs, then split them into chunks.
all_pdf_documents = process_pdfs('../data/pdf_files')
chunks = split_documents(all_pdf_documents)
# Bare expression so the notebook echoes the resulting chunk list as cell output.
chunks

Processing: ..\data\pdf_files\invoice_INV-20251105-001.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-002.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-003.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-004.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-005.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-006.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-007.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-008.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-009.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-010.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-011.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-012.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-013.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-014.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-015.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-016.pdf
Processing: ..\data\pdf_files\invoice_INV-20251105-017.p

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': '', 'creationdate': 'D:20251105025513', 'source': '..\\data\\pdf_files\\invoice_INV-20251105-001.pdf', 'file_path': '..\\data\\pdf_files\\invoice_INV-20251105-001.pdf', 'total_pages': 1, 'format': 'PDF 1.3', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': 'D:20251105025513', 'page': 0, 'source_file': 'invoice_INV-20251105-001.pdf', 'file_type': 'pdf'}, page_content='Invoice\nInvoice Number: INV-20251105-001\nDate: 2025-11-05\nYour Company Name\n1234 Business St.\nCity, State, ZIP\nPhone: (123) 456-7890\nEmail: contact@yourcompany.com\nBill To:\nCustomer Name\n5678 Client Ave.\nCity, State, ZIP\nDescription\nQty\nUnit Price\nTotal\nProduct A\n2\n$10.00\n$20.00\nProduct B\n1\n$15.50\n$15.50\nService C\n3\n$7.25\n$21.75\nTotal Amount:\n$57.25\nPayment Terms: Due within 30 days. Late payments will incur a 5% late fee per month.'),
 

### Embedding and VectorStore DB