In [2]:
import os
import json

In [25]:
# Read the API credentials from ../config/keys.json. The context manager
# closes the file handle as soon as parsing finishes instead of leaving it
# open until garbage collection; the encoding is pinned because JSON files
# are UTF-8 regardless of the platform's default locale encoding.
with open("../config/keys.json", encoding="utf-8") as keys_file:
    keys = json.load(keys_file)

In [26]:
# Export the tracing settings and the API credentials read from `keys` as
# environment variables for this kernel process, all in one mapping.
os.environ.update({
    'LANGCHAIN_TRACING': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
    'LANGCHAIN_API_KEY': keys['LANGCHAIN_API_KEY'],
    'LANGSMITH_PROJECT': "RuleBookAssistant",
    'OPENAI_API_KEY': keys['OPENAI_API_KEY'],
})

In [42]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from openai import OpenAI
from langsmith.wrappers import wrap_openai
import re

In [11]:
class DocumentLoader:
    """Handles loading and processing D&D PDF rulebooks using LangChain.

    Pages are loaded with ``PyPDFLoader`` (one document per page), their text
    is cleaned with ``_clean_text``, and the cleaned pages are split into
    overlapping chunks with a ``RecursiveCharacterTextSplitter``.
    """

    def __init__(self, output_dir, chunk_size=1000, chunk_overlap=200):
        """Create the output directory and configure the text splitter.

        Args:
            output_dir: Directory that is created if it does not exist and
                stored on the instance. No method of this class writes to it.
            chunk_size: Value passed to the splitter as ``chunk_size``;
                lengths are measured with ``len``. Defaults to 1000.
            chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
                Defaults to 200.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Configure text splitter for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def load_pdf(self, pdf_path):
        """Load a single PDF file using LangChain's loader.

        Each returned document has its ``source`` metadata replaced by the
        file's base name and its text cleaned in place.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The list of documents produced by ``PyPDFLoader``.
        """
        print(f"Loading {pdf_path}...")
        documents = PyPDFLoader(pdf_path).load()

        source_name = os.path.basename(pdf_path)
        self._clean_documents(documents, source_name=source_name)

        print(f"Loaded {len(documents)} pages from {source_name}")
        return documents

    def load_directory(self, directory_path, glob_pattern="**/*.pdf"):
        """Load all PDFs in a directory using LangChain's DirectoryLoader.

        The text of every document is cleaned in place. Unlike ``load_pdf``,
        this method does not touch the metadata: ``source`` stays whatever the
        loader set.

        Args:
            directory_path: Directory to search.
            glob_pattern: Glob passed to ``DirectoryLoader`` to select files.

        Returns:
            The list of documents produced by the loader.
        """
        print(f"Loading PDFs from {directory_path}...")
        loader = DirectoryLoader(directory_path, glob=glob_pattern, loader_cls=PyPDFLoader)
        documents = loader.load()

        self._clean_documents(documents)

        print(f"Loaded {len(documents)} total pages from {directory_path}")
        return documents

    def split_documents(self, documents):
        """Split documents into chunks using the configured text splitter.

        Args:
            documents: Documents to split.

        Returns:
            The chunks returned by ``self.text_splitter.split_documents``.
        """
        print(f"Splitting {len(documents)} documents into chunks...")
        chunks = self.text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks")
        return chunks

    def _clean_documents(self, documents, source_name=None):
        """Clean each document's text in place, optionally overriding ``source``.

        Args:
            documents: Documents exposing ``page_content`` and ``metadata``.
            source_name: When given, written to ``metadata["source"]`` of
                every document; when ``None`` the metadata is left alone.

        Returns:
            The same ``documents`` object, for convenience.
        """
        for doc in documents:
            if source_name is not None:
                doc.metadata["source"] = source_name
            doc.page_content = self._clean_text(doc.page_content)
        return documents

    def _clean_text(self, text):
        """Clean and normalize extracted text.

        Lines whose stripped length is at most one character (blank lines and
        single-character lines) are dropped as a rough header/footer filter,
        then every run of two or more spaces is collapsed to a single space.

        Args:
            text: Raw page text.

        Returns:
            The cleaned text, with the kept lines joined by single newlines.
        """
        kept_lines = [line for line in text.split('\n') if len(line.strip()) > 1]

        # Blank lines never survive the filter above, so the joined text cannot
        # contain consecutive newlines; only runs of spaces need collapsing.
        return re.sub(r' {2,}', ' ', '\n'.join(kept_lines))

    def process_pdf(self, pdf_path):
        """Process a single PDF: load and split into chunks.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The chunks produced from the file's cleaned pages.
        """
        return self.split_documents(self.load_pdf(pdf_path))

    def process_directory(self, directory_path):
        """Process all PDFs in a directory: load and split into chunks.

        Args:
            directory_path: Directory searched with the default glob pattern.

        Returns:
            The chunks produced from all cleaned pages.
        """
        return self.split_documents(self.load_directory(directory_path))

In [12]:
# Create the loader; constructing it also creates ../data/processed if it
# does not exist yet.
doc_loader = DocumentLoader(output_dir="../data/processed")


In [13]:
# Directory holding the source rulebook PDFs.
raw_data_path = "../data/raw"
# Entries found in the raw directory. NOTE(review): no other visible cell
# reads `pdfs` -- confirm whether it is still needed.
pdfs = os.listdir(raw_data_path)
# Load every PDF under raw_data_path, clean the page text, and split the
# pages into chunks.
splits = doc_loader.process_directory(raw_data_path)

Loading PDFs from ../data/raw...
Loaded 967 total pages from ../data/raw
Splitting 967 documents into chunks...
Created 4749 chunks


In [14]:
# Build the Chroma index once and reuse it on later runs. Calling
# Chroma.from_documents against an already-populated persist_directory embeds
# every chunk again and adds the results to the existing collection, so simply
# re-running this cell would duplicate entries and repeat the embedding cost.
# Delete the directory to force a rebuild after the PDFs or chunking change.
vectorstore_dir = "../data/vectorstore"
embeddings = OpenAIEmbeddings()

if os.path.isdir(vectorstore_dir) and os.listdir(vectorstore_dir):
    # Reopen the persisted collection without re-embedding anything.
    vectorstore = Chroma(
        persist_directory=vectorstore_dir,
        embedding_function=embeddings,
    )
else:
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=vectorstore_dir,
    )

In [28]:
# Wrap the vector store as a retriever; no arguments are passed, so the
# store's default search settings apply.
retriever = vectorstore.as_retriever()

In [46]:
# The prompt is defined inline here. Hub-hosted alternative:
# prompt = hub.pull("RuleBookAssistant")
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. "
            "Please respond to the user's request only based on the given context. "
            "Give references and page numbers where applicable.",
        ),
        ("user", "Question: {question}\nContext: {context}"),
    ]
)
# Chat model that answers the filled-in prompt.
model = ChatOpenAI(model="gpt-4o-mini")
# Parser that turns the model's chat message into a plain string.
output_parser = StrOutputParser()

In [None]:
def format_docs(docs):
    """Render retrieved documents as plain text, one labelled section each.

    Without this step the list of Document objects is interpolated into the
    prompt as its raw repr. The explicit header gives the model a clean
    source name and page number to cite.

    Args:
        docs: Documents returned by the retriever; each exposes ``metadata``
            and ``page_content``.

    Returns:
        A single string with the sections separated by blank lines.
    """
    sections = []
    for doc in docs:
        source = doc.metadata.get("source", "unknown source")
        page = doc.metadata.get("page")
        if isinstance(page, int):
            # PyPDFLoader stores a 0-based page index; show the 1-based
            # number a PDF viewer would display.
            header = f"[{source}, page {page + 1}]"
        else:
            header = f"[{source}]"
        sections.append(f"{header}\n{doc.page_content}")
    return "\n\n".join(sections)


# Chain: retrieve context for the question, format it, fill the prompt,
# call the model, and return the answer as a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [48]:
# Example query about campaign settings; the bare expression displays the
# answer string returned by the chain.
rag_chain.invoke("What some different locations campaigns can take place?")

"Campaigns can take place in various locations, each serving unique narrative and gameplay purposes. Here are some examples drawn from the context:\n\n1. **Towns or Cities**: A campaign can center on a particular town or city, which serves as a home base for the characters. This setting can foster emotional ties, especially if the players have NPCs they care about in that location (page 15).\n\n2. **Dungeons**: Starting a campaign in the dungeons of an evil baron's castle is another option. This setting can thrust characters immediately into the action and adventure (page 26).\n\n3. **Villages or Wilderness Edges**: A village at the edge of the wilderness can serve as a home base, particularly in campaigns that explore more rural or natural themes (page 25).\n\n4. **Regional Landscapes**: Filling in a local region around the home base, including dungeons or adventure locales, can enrich the campaign experience. This approach allows for exploration of nearby settlements and wilderness (

In [52]:
# Example query about the resting rules; the bare expression displays the
# answer string returned by the chain.
rag_chain.invoke("How does resting work?")

"Resting in the context of adventuring is crucial for characters to recover and maintain their effectiveness. There are two primary types of rests: short rests and long rests.\n\n### Short Rest\nA short rest is a period of downtime lasting at least 1 hour. During this time, characters can eat, drink, read, and tend to their wounds but cannot engage in strenuous activities. At the end of a short rest, characters can spend Hit Dice (up to their maximum, which equals their character level) to regain hit points. Each Hit Die spent allows the player to roll the die and add their Constitution modifier to the total, thus regaining hit points. Characters can choose to spend additional Hit Dice after each roll, up to their limit.\n\n### Long Rest\nA long rest requires at least 8 hours of downtime, during which a character can sleep or engage in light activities such as reading or talking but cannot perform strenuous activities for more than 2 hours. If a long rest is interrupted by strenuous ac

In [55]:
# Example query about surprise in combat; the bare expression displays the
# answer string returned by the chain.
rag_chain.invoke("How can a player surprise and enemy? What does that do?")

"A player can surprise an enemy by utilizing stealth tactics, for instance, by hiding and sneaking up on them without being detected. The Dungeon Master (DM) determines if a surprise occurs during a combat encounter based on the stealth checks of the players compared to the passive perception of the enemies. If successful, the surprised side does not get to take actions during the first round of combat, which can provide the players with a significant tactical advantage (Player's Handbook, page 172).\n\nSurprising an enemy can lead to several benefits, such as gaining the initiative in combat, allowing the surprise-attacking players to act first and potentially eliminate threats before they can respond. Additionally, it can disrupt the enemy's plans and instill fear, potentially causing them to falter or react unpredictably (Player's Handbook, page 173). \n\nOverall, employing strategies to gain surprise can give players both a narrative and mechanical edge in their encounters."