In [2]:
import os
import json

In [3]:
keys = json.load(open("../config/keys.json"))

In [4]:
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.tracing.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = keys['LANGCHAIN_API_KEY']
os.environ['OPENAI_API_KEY'] = keys['OPENAI_API_KEY']

In [5]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import re

In [11]:
class DocumentLoader:
    """Handles loading and processing D&D PDF rulebooks using LangChain."""
    
    def __init__(self, output_dir):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        
        # Configure text splitter for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            is_separator_regex=False,
        )
    
    def load_pdf(self, pdf_path):
        """Load a single PDF file using LangChain's loader."""
        print(f"Loading {pdf_path}...")
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        
        # Add source metadata
        source_name = os.path.basename(pdf_path)
        for doc in documents:
            doc.metadata["source"] = source_name
            # Clean up potential page headers/footers
            doc.page_content = self._clean_text(doc.page_content)
        
        print(f"Loaded {len(documents)} pages from {source_name}")
        return documents
    
    def load_directory(self, directory_path, glob_pattern="**/*.pdf"):
        """Load all PDFs in a directory using LangChain's DirectoryLoader."""
        print(f"Loading PDFs from {directory_path}...")
        loader = DirectoryLoader(directory_path, glob=glob_pattern, loader_cls=PyPDFLoader)
        documents = loader.load()
        
        # Add additional metadata and clean text
        for doc in documents:
            # Clean text
            doc.page_content = self._clean_text(doc.page_content)
        
        print(f"Loaded {len(documents)} total pages from {directory_path}")
        return documents
    
    def split_documents(self, documents):
        """Split documents into chunks using the text splitter."""
        print(f"Splitting {len(documents)} documents into chunks...")
        chunks = self.text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks")
        return chunks
    
    def _clean_text(self, text):
        """Clean and normalize extracted text."""
        # Remove headers/footers
        lines = text.split('\n')
        filtered_lines = [line for line in lines if len(line.strip()) > 1]
        
        # Remove excessive whitespace
        text = '\n'.join(filtered_lines)
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r' {2,}', ' ', text)
        
        return text
    
    def process_pdf(self, pdf_path):
        """Process a single PDF: load and split into chunks."""
        documents = self.load_pdf(pdf_path)
        chunks = self.split_documents(documents)
        return chunks
    
    def process_directory(self, directory_path):
        """Process all PDFs in a directory: load and split into chunks."""
        documents = self.load_directory(directory_path)
        chunks = self.split_documents(documents)
        return chunks

In [12]:
doc_loader = DocumentLoader(output_dir="../data/processed")


In [13]:
raw_data_path = "../data/raw"
pdfs = os.listdir(raw_data_path)
splits = doc_loader.process_directory(raw_data_path)

Loading PDFs from ../data/raw...
Loaded 967 total pages from ../data/raw
Splitting 967 documents into chunks...
Created 4749 chunks


In [None]:
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="../data/vectorstore",
)

In [10]:
retriever = vectorstore.as_retriever()

In [None]:
prompt = hub.pull("rag")