In [22]:
import os
import hashlib
from typing import List, Dict
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import uuid
import re

In [4]:
PDFS_FOLDER = './pdfs/'
# Build the list of PDF paths to ingest.
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order, and a later cell picks `files_to_process[0]`.
# - The extension check is case-insensitive (same rule as get_new_pdfs), so a
#   file named "paper.PDF" is not silently skipped.
files_to_process = [
    os.path.join(PDFS_FOLDER, name)
    for name in sorted(os.listdir(PDFS_FOLDER))
    if name.lower().endswith('.pdf')
]
print(files_to_process)

['./pdfs/Attention_Is_All_You_Need.pdf', './pdfs/LONG SHORT-TERM MEMORY.pdf']


In [6]:
# Compute the MD5 digest of the first PDF; the digest is stored in chunk
# metadata by a later cell (via `hasher`) to identify this version of the file.
filepath = files_to_process[0]
print(f"Calculating MD5 for file: {filepath}")
hasher = hashlib.md5()
with open(filepath, "rb") as f:
    # Feed the digest in 1 MiB blocks instead of reading the whole PDF into memory.
    for buf in iter(lambda: f.read(1 << 20), b""):
        hasher.update(buf)
print(hasher.hexdigest())

Calculating MD5 for file: ./pdfs/Attention_Is_All_You_Need.pdf
18e1b007a1dab45b30cc861ba2dfda25


In [8]:
RAG_PERSIST_DIR = './rag_vectorstore/'
# Map of PDF basename -> content hash for every document already stored in the
# vector store. Stays empty when the persist directory does not exist yet.
existing = {}
if os.path.exists(RAG_PERSIST_DIR):
    db = Chroma(persist_directory=RAG_PERSIST_DIR, embedding_function=OllamaEmbeddings(model="mxbai-embed-large"))
    docs = db.get(include=["metadatas"])
    # NOTE(review): Chroma can report `None` for records stored without
    # metadata; skip those instead of failing on `"source" in None`.
    for meta in docs["metadatas"] or []:
        if meta and "source" in meta and "hash" in meta:
            existing[os.path.basename(meta["source"])] = meta["hash"]

print(existing)

{}


In [23]:

def clean_text(text: str) -> str:
    """Flatten extracted page text into one whitespace-normalised string.

    The substitutions below are applied strictly in order:

    1. collapse runs of newlines into a single newline,
    2. drop "Page <n>" markers (case-insensitive),
    3. drop lines that contain nothing but digits,
    4. turn remaining lone newlines into spaces,
    5. collapse every run of whitespace (newlines included) into one space.

    Because of step 5 the returned string never contains a newline.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # (pattern, replacement, regex flags) -- order matters.
    pipeline = (
        (r'\n+', '\n', 0),
        (r'\bPage\s*\d+\b', '', re.IGNORECASE),
        (r'^\s*\d+\s*$', '', re.MULTILINE),
        (r'(?<!\n)\n(?!\n)', ' ', 0),
        (r'\s+', ' ', 0),
    )

    cleaned = text
    for pattern, replacement, flags in pipeline:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()


# Typographic characters mapped to their plain-ASCII equivalents.
# Covers the full set of Latin f-ligatures and both opening and closing
# curly quotes, so e.g. "e\ufb03cient" becomes "efficient".
_TYPOGRAPHY_TABLE = str.maketrans({
    "\ufb00": "ff",   # LATIN SMALL LIGATURE FF
    "\ufb01": "fi",   # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",   # LATIN SMALL LIGATURE FL
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\u201c": "\"",   # LEFT DOUBLE QUOTATION MARK
    "\u201d": "\"",   # RIGHT DOUBLE QUOTATION MARK
    "\u2018": "'",    # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",    # RIGHT SINGLE QUOTATION MARK
})


def normalize_text(text: str) -> str:
    """Replace typographic ligatures and curly quotes with ASCII equivalents.

    Args:
        text: Text that may contain f-ligatures or curly quotes.

    Returns:
        A copy of `text` in which each ligature is expanded to its letters and
        each curly quote is replaced by the matching straight quote. All other
        characters are left untouched.
    """
    return text.translate(_TYPOGRAPHY_TABLE)


def remove_references(text: str) -> str:
    """Truncate `text` just before its reference/bibliography heading.

    Only heading-style spellings ("References", "REFERENCES", "Bibliography",
    "BIBLIOGRAPHY") are treated as the start of the section. A lowercase
    "references" inside a sentence (e.g. "this work references earlier
    models") no longer cuts away the rest of the text.

    Limitation: a sentence that starts with the capitalised word
    ("References to ...") is still treated as a heading.

    Args:
        text: Text that may contain a reference section.

    Returns:
        Everything before the first heading match, or `text` unchanged when no
        heading is found.
    """
    heading = re.search(r'\b(?:References|REFERENCES|Bibliography|BIBLIOGRAPHY)\b', text)
    return text[:heading.start()] if heading else text

In [24]:
# Embedding model, accessed through the Ollama integration.
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

# Splitter configuration used by the single-file experiment cell that follows.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=80,
    is_separator_regex=False,
)

# Accumulator for the processed chunks.
all_docs = list()

In [25]:
loader = PyPDFLoader(filepath)
docs = loader.load()
CLEAN_REFERENCES = True
# Clean and normalize the text of every loaded document in place.
# NOTE(review): remove_references runs on each loaded document separately, so
# only the document containing the heading is truncated; later ones are kept.
for d in docs:
    d.page_content = normalize_text(clean_text(d.page_content))
    if CLEAN_REFERENCES:
        d.page_content = remove_references(d.page_content)

chunks = text_splitter.split_documents(docs)

# Hash the file that was actually loaded above instead of reusing the global
# `hasher` left behind by the MD5 cell: that object silently goes stale if
# `filepath` is reassigned or the cells are run out of order.
file_digest = hashlib.md5()
with open(filepath, "rb") as pdf_file:
    for block in iter(lambda: pdf_file.read(1 << 20), b""):
        file_digest.update(block)
file_hash = file_digest.hexdigest()

source_name = os.path.basename(filepath)
for idx, chunk in enumerate(chunks):
    chunk.metadata["source"] = filepath
    chunk.metadata["hash"] = file_hash
    # Position of the chunk within its file, zero-padded to three digits.
    chunk.metadata["chunk_id"] = f"{source_name}_{idx:03d}"
    chunk.metadata["uuid"] = str(uuid.uuid4())  # globally unique identifier
    print(chunk.metadata)

# Start from a fresh list so re-running this cell does not duplicate chunks.
all_docs = list(chunks)

{'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './pdfs/Attention_Is_All_You_Need.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'hash': '18e1b007a1dab45b30cc861ba2dfda25', 'chunk_id': 'Attention_Is_All_You_Need.pdf_000', 'uuid': '084e61b4-455b-452c-adf6-1244d10368f1'}
{'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './pdfs/Attention_Is_All_You_Need.pdf', 'total_pages': 15, 'page': 0, 'pag

In [26]:
# Display the first chunk to sanity-check its metadata and cleaned text.
all_docs[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './pdfs/Attention_Is_All_You_Need.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'hash': '18e1b007a1dab45b30cc861ba2dfda25', 'chunk_id': 'Attention_Is_All_You_Need.pdf_000', 'uuid': '084e61b4-455b-452c-adf6-1244d10368f1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works. Attention Is All You Need Ashish Vaswani∗ Google Brain avaswani@google.com Noam Shazeer∗ Google Brain noam@google.com Niki Parmar∗ Google Research nikip@google.com Jakob Uszkoreit∗ Google Research usz@google.com Lli

In [None]:








def get_new_pdfs(pdf_folder: str, existing_hashes: Dict[str, str]) -> List[str]:
    """Find the PDFs in `pdf_folder` that still need to be ingested.

    A file qualifies when its name is missing from `existing_hashes` or when
    the hash recorded there differs from the file's current hash. Hashing is
    delegated to `get_file_hash`, which is defined outside this block.

    Args:
        pdf_folder: Directory to scan (not recursive).
        existing_hashes: Mapping of file name to the previously recorded hash.

    Returns:
        Paths (folder joined with file name) of the new or changed PDFs, in
        the order returned by `os.listdir`.
    """
    pending: List[str] = []
    for entry in os.listdir(pdf_folder):
        # Case-insensitive extension check; everything else is ignored.
        if entry.lower().endswith(".pdf"):
            candidate = os.path.join(pdf_folder, entry)
            current_hash = get_file_hash(candidate)
            unchanged = entry in existing_hashes and existing_hashes[entry] == current_hash
            if not unchanged:
                pending.append(candidate)
    return pending


def process_pdfs(pdf_files: List[str], persist_dir: str):
    """Load PDFs, chunk their text, embed the chunks and store them in Chroma.

    Every chunk is tagged with its `source` path and the file `hash` (computed
    by `get_file_hash`, which is defined outside this block). Chunks that are
    already stored under the same `source` are removed once the new chunks
    have been added, so re-ingesting a changed PDF replaces its old content
    instead of piling up stale duplicates.

    Args:
        pdf_files: Paths of the PDFs to ingest. An empty list is a no-op.
        persist_dir: Directory of the persistent Chroma store.

    Returns:
        None.
    """
    if not pdf_files:
        print("✅ No new PDFs found.")
        return

    print(f"📄 Found {len(pdf_files)} new PDF(s): {[os.path.basename(f) for f in pdf_files]}")

    embeddings = OllamaEmbeddings(model="mxbai-embed-large")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    all_docs = []

    for filepath in pdf_files:
        loader = PyPDFLoader(filepath)
        docs = loader.load()
        chunks = text_splitter.split_documents(docs)
        file_hash = get_file_hash(filepath)
        for chunk in chunks:
            chunk.metadata["source"] = filepath
            chunk.metadata["hash"] = file_hash
        all_docs.extend(chunks)

    print("🔢 Creating embeddings and updating Chroma...")
    db = Chroma(persist_directory=persist_dir, embedding_function=embeddings)

    # Record the ids of chunks stored by an earlier ingestion of these files
    # BEFORE adding anything, so the new chunks are not matched as stale.
    # Matching is by exact `source` string, i.e. the same path spelling.
    stale_ids = []
    for filepath in pdf_files:
        stale_ids.extend(db.get(where={"source": filepath}, include=["metadatas"])["ids"])

    # Skip the call when nothing was extracted (e.g. PDFs without a text layer).
    if all_docs:
        db.add_documents(all_docs)
    else:
        print("⚠️ No text chunks were extracted; nothing to add.")

    # Delete the old versions only after the new ones were stored, so a failed
    # embedding run does not leave the store without any copy of the file.
    if stale_ids:
        db.delete(ids=stale_ids)

    # NOTE(review): recent Chroma releases persist automatically and deprecate
    # this call -- confirm against the installed version.
    db.persist()
    print("✅ Update complete!")


def update_rag(pdf_folder: str, persist_dir: str):
    """Run one incremental ingestion pass over `pdf_folder`.

    Loads what the store at `persist_dir` already knows (through
    `load_existing_metadata`, defined outside this block), asks
    `get_new_pdfs` which files are new or changed, and passes those to
    `process_pdfs`.

    Args:
        pdf_folder: Directory containing the PDFs.
        persist_dir: Directory of the persistent vector store.
    """
    print("🔍 Checking for new PDFs...")
    # Passed to get_new_pdfs as its `existing_hashes` mapping.
    known_hashes = load_existing_metadata(persist_dir)
    process_pdfs(get_new_pdfs(pdf_folder, known_hashes), persist_dir)


if __name__ == "__main__":
    # Entry point: make sure both working directories exist, then run one
    # update pass.
    # NOTE(review): "./chroma_db" differs from the './rag_vectorstore/'
    # directory used earlier in this notebook -- confirm which store is intended.
    PDF_FOLDER, PERSIST_DIR = "./pdfs", "./chroma_db"

    for required_dir in (PDF_FOLDER, PERSIST_DIR):
        os.makedirs(required_dir, exist_ok=True)

    update_rag(PDF_FOLDER, PERSIST_DIR)
