In [2]:
%pip install langchain pypdf langchain_openai langchain_chroma langchain_text_splitters tqdm python-dotenv markdown langchain-community

Collecting langchain
  Using cached langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Collecting langchain_openai
  Using cached langchain_openai-0.3.17-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain_chroma
  Using cached langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting langchain_text_splitters
  Using cached langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting markdown
  Using cached markdown-3.8-py3-none-any.whl.metadata (5.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Using cached langchain_core-0.3.60-py3-none-any.whl.metadata (5.8 kB)
Collecting langsmith<0.

In [3]:
import os
import glob
from tqdm import tqdm
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
import logging

# Configure the root logger once for the whole notebook: timestamped records
# at INFO level and above. Because this is the root logger, INFO records
# emitted by libraries (e.g. the HTTP request lines logged during embedding
# calls) are shown as well.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [6]:
# Load variables from a local .env file (if one exists) into the process
# environment so the key never has to live in the notebook itself.
load_dotenv()

# SECURITY: never paste the secret into a cell -- notebooks are routinely
# shared and committed together with their source and outputs. Any key that
# was ever stored in this file must be treated as leaked and revoked.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with an actionable message instead of a late authentication error
# in the middle of the embedding step.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a .env file "
        "next to this notebook."
    )

## Define Functions

In [8]:
def pdf_to_markdown(pdf_path):
    """Load a PDF with ``PyPDFLoader`` and tag every document it yields.

    Despite the function name, the extracted text is not transformed here:
    each loaded document only has its ``source`` metadata overwritten with
    the file's base name and a ``format`` entry set to ``"markdown"``.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        The list of loaded documents. If anything goes wrong while loading
        or tagging, the error is logged and an empty list is returned so
        that one bad file does not abort a batch run.
    """
    try:
        pages = PyPDFLoader(pdf_path).load()

        # Same two metadata entries for every document of this file.
        tags = {"source": os.path.basename(pdf_path), "format": "markdown"}
        for page in pages:
            page.metadata.update(tags)

        return pages
    except Exception as exc:
        # Deliberate best-effort: report the failure and hand back nothing.
        logging.error(f"Error processing {pdf_path}: {str(exc)}")
        return []

def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Cut documents into overlapping pieces with a recursive character splitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Length shared between neighbouring chunks, in the
            same unit as ``chunk_size``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are treated as literal strings, not regular expressions.
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def store_in_chroma(chunks, collection_name="pdf_documents", persist_directory="./chroma_db"):
    """Embed document chunks with OpenAI and write them to a Chroma collection.

    Args:
        chunks: Document chunks to embed and store.
        collection_name: Name of the Chroma collection to write to.
        persist_directory: Directory handed to Chroma as its on-disk location.

    Returns:
        The ``Chroma`` object returned by ``Chroma.from_documents``.
    """
    # The API key is read from the notebook-level ``openai_api_key`` variable.
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)

    return Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=persist_directory,
        collection_name=collection_name,
    )

In [9]:
def process_pdfs(folder_path, chunk_size=1000, chunk_overlap=200, collection_name="pdf_documents",
                 persist_directory="./chroma_db"):
    """Load every PDF in a folder, chunk the text and store it in ChromaDB.

    Args:
        folder_path: Folder that is searched (non-recursively) for ``*.pdf`` files.
        chunk_size: Chunk length forwarded to ``chunk_documents``.
        chunk_overlap: Chunk overlap forwarded to ``chunk_documents``.
        collection_name: Chroma collection that receives the chunks.
        persist_directory: Directory handed to ``store_in_chroma`` for the
            database. Defaults to ``"./chroma_db"``, the location that was
            previously hard-coded in this function.

    Returns:
        The object returned by ``store_in_chroma``, or ``None`` when there was
        nothing to store (no PDF files found, or no chunk could be produced).

    Raises:
        ValueError: If ``folder_path`` does not exist.
    """
    if not os.path.exists(folder_path):
        raise ValueError(f"Folder path does not exist: {folder_path}")

    # glob returns files in arbitrary order; sort so runs are reproducible.
    pdf_files = sorted(glob.glob(os.path.join(folder_path, "*.pdf")))

    if not pdf_files:
        logging.warning(f"No PDF files found in {folder_path}")
        return None

    logging.info(f"Found {len(pdf_files)} PDF files.")

    all_documents = []
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        # pdf_to_markdown returns [] for a file it could not read, so a single
        # broken PDF is skipped rather than aborting the run.
        all_documents.extend(pdf_to_markdown(pdf_file))

    logging.info(f"Total document pages extracted: {len(all_documents)}")

    chunks = chunk_documents(all_documents, chunk_size, chunk_overlap)
    logging.info(f"Total chunks created: {len(chunks)}")

    # If every PDF failed to load (or held no text) there is nothing to embed.
    # Stop here, mirroring the "no PDF files" outcome, instead of sending an
    # empty batch to the embedding / storage step.
    if not chunks:
        logging.warning(f"No chunks could be created from the PDF files in {folder_path}; nothing was stored.")
        return None

    db = store_in_chroma(chunks, collection_name, persist_directory)

    logging.info(f"Documents successfully stored in ChromaDB at {persist_directory}")
    return db

In [12]:
# Where the PDFs live and which Chroma collection receives them.
folder_path = "./pdf_documents"
collection_name = "pdf_documents"

# Chunking parameters forwarded to the splitter.
chunk_size = 1000
chunk_overlap = 200

# Run the full load -> chunk -> embed -> store pipeline.
db = process_pdfs(
    folder_path,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    collection_name=collection_name,
)

2025-05-19 04:39:19,453 - INFO - Found 8 PDF files.
Processing PDFs: 100%|██████████| 8/8 [00:12<00:00,  1.52s/it]
2025-05-19 04:39:31,654 - INFO - Total document pages extracted: 582
2025-05-19 04:39:31,702 - INFO - Total chunks created: 1151
2025-05-19 04:39:37,229 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-19 04:39:41,065 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-05-19 04:39:45,338 - INFO - Documents successfully stored in ChromaDB at ./chroma_db


In [13]:
def connect_to_db(persist_directory="./chroma_db", collection_name="pdf_documents"):
    """Open a Chroma collection without adding any documents to it.

    Args:
        persist_directory: Directory handed to Chroma as its on-disk location.
        collection_name: Name of the collection to open.

    Returns:
        A ``Chroma`` object configured with OpenAI embeddings; the API key is
        read from the notebook-level ``openai_api_key`` variable.
    """
    return Chroma(
        collection_name=collection_name,
        persist_directory=persist_directory,
        embedding_function=OpenAIEmbeddings(openai_api_key=openai_api_key),
    )

# Open the collection using connect_to_db's default directory and collection
# name. This rebinds the notebook-level name `db`.
db = connect_to_db()
# NOTE(review): `_collection` is an underscore-prefixed attribute of the Chroma
# wrapper, i.e. not part of its public interface -- confirm it is still
# available whenever the langchain_chroma version changes.
print(f"Number of documents in ChromaDB: {db._collection.count()}")

In [None]:
# Reference only: the snippet below is wrapped in a string literal, so none of
# it is executed. It sketches how to connect to a Chroma HTTP server instead of
# a local persist directory. The host, port and auth header value are
# placeholders -- supply real values from configuration, never inline.
'''
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import chromadb

# Configure ChromaDB client for HTTP server
chroma_client = chromadb.HttpClient(
    host="localhost",  # Or your server address
    port=8000,
    headers={"X-Chroma-Auth": "your_password_here"}
)

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings()

# Connect to ChromaDB with HTTP client
db = Chroma(
    client=chroma_client,
    collection_name="pdf_documents",
    embedding_function=embeddings
)
'''