In [5]:
# Pipeline switches, listed in the order the stages run.
# Set to True to (re)load and split the source PDFs.
refresh_docs = False
# Set to True to rebuild the "sourcebooks" LanceDB table; when False the
# table already on disk is opened instead.
refresh_embeddings = False

In [6]:
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# The split documents are consumed by the embedding-rebuild step, so they must
# also be loaded when only `refresh_embeddings` is set; otherwise that step
# fails with a NameError on a fresh kernel.
if refresh_docs or refresh_embeddings:
    # Eberron sourcebook: chunk_size=1000 with the splitter's default overlap.
    eberron_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
    eberron_loader = PDFMinerLoader("/workspaces/documents/Eberron_last_war.pdf")

    eberron_data = eberron_loader.load_and_split(text_splitter=eberron_splitter)

    # Twisted Taverns guide: larger chunks (2500) with an explicit 750 overlap.
    tt_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=750)
    tt_loader = PDFMinerLoader("/workspaces/documents/Guide_to_Twisted_Taverns.pdf")

    tt_data = tt_loader.load_and_split(text_splitter=tt_splitter)

In [7]:
from langchain_community.vectorstores import LanceDB
from langchain.embeddings.openai import OpenAIEmbeddings
import lancedb

if refresh_embeddings:
    # Fail early with an actionable message if the split documents are not in
    # the kernel, instead of an opaque NameError further down.
    missing_docs = [name for name in ("eberron_data", "tt_data") if name not in globals()]
    if missing_docs:
        raise NameError(
            f"{', '.join(missing_docs)} not defined; run the document-loading cell "
            "with refresh_docs = True before rebuilding embeddings."
        )

    # Single embedding client, reused for the seed row and for the documents.
    embeddings = OpenAIEmbeddings()

    all_sourcebooks = eberron_data + tt_data

    db = lancedb.connect("/workspaces/llm-dungeon-master/db")
    # mode="overwrite" replaces any existing "sourcebooks" table. The table is
    # created from one placeholder row.
    # NOTE(review): nothing here removes the "Hello World" row afterwards, so it
    # stays in the table alongside the real chunks — confirm that is acceptable.
    table = db.create_table(
        "sourcebooks",
        data=[
            {
                "vector": embeddings.embed_query("Hello World"),
                "text": "Hello World",
                "id": "1",
            }
        ],
        mode="overwrite",
    )

    # Embed the already-split chunks and add them to the table.
    vectorstore = LanceDB.from_documents(all_sourcebooks, embeddings, connection=table)

In [14]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

if not refresh_embeddings:
    # The table was not rebuilt in this run, so attach to the "sourcebooks"
    # table that already exists in the on-disk database.
    embedding_function = OpenAIEmbeddings()
    db = lancedb.connect("/workspaces/llm-dungeon-master/db")
    table = db.open_table("sourcebooks")
    vectorstore = LanceDB(table, embedding_function)
    
def format_docs(docs):
    """Join the page content of the retrieved documents into one context string.

    Without this step the prompt receives the repr of a list of Document
    objects (metadata and escaped newlines included) instead of plain text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Retriever over the "sourcebooks" vector store.
lance_retriever = vectorstore.as_retriever()

template = """Answer the question based only on the following context:
{context}.

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI()

# The input question is sent to the retriever (whose results are flattened to
# text) and passed through unchanged as {question}; the filled prompt goes to
# the chat model and the reply is parsed to a plain string.
chain = (
    {"context": lance_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [15]:
# Ask the chain a question; the answer string is the cell's displayed value.
question = "Who is Rimak chaska?"
chain.invoke(question)

"Rimak Chaska is a male high elf bard who is posing as a wealthy noble. He is the leader of the Adventure Belles, a bardic troupe seeking revenge against the Half'n'Halfs for winning a posting at The Wildaback tavern. Rimak spends most of his time in the Main Hall and is always present during performances. He has a Charisma score of 16 and can cast various bard spells."

In [None]:
import os

def get_folder_size(folder):
    """Return the combined size, in bytes, of every file beneath ``folder``.

    The tree is traversed with ``os.walk`` and each file is measured with
    ``os.path.getsize``; directories themselves contribute nothing.
    """
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _subdirs, filenames in os.walk(folder)
        for filename in filenames
    )

# Report the on-disk footprint of the LanceDB database directory.
db_folder = "/workspaces/llm-dungeon-master/db"
folder_size = get_folder_size(db_folder)
print(f"The size of the folder is {folder_size} bytes.")

The size of the folder is 16513700 bytes.
