In [1]:
import os
from pathlib import Path
import glob
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import fitz

In [37]:
# Notebook-wide settings: chat model name, Chroma persistence directory,
# and the folder that is scanned for PDF files.
MODEL = "gpt-4o-mini"
db_name = "chroma_db"
folder_path = "PDFs"

# Load variables from a .env file, letting its values override existing ones.
load_dotenv(override=True)
# Keep an existing OPENAI_API_KEY; otherwise fall back to the placeholder value.
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [41]:
def pdf_to_text_chunks(folder_path, chunk_size=1000, chunk_overlap=100):
    """Extract the text of every PDF in a folder and split it into chunks.

    Args:
        folder_path: Directory scanned (non-recursively) for PDF files.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``;
            lengths are measured with ``len``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.

    Returns:
        A list of ``{"text": ..., "source": ...}`` dicts, one per chunk, where
        ``source`` is the file name of the PDF the chunk was taken from.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    all_chunks = []
    # os.listdir returns entries in arbitrary order; sort so the chunk order
    # is reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, filename)
        # Open the document in a with-block so it is closed once its text has
        # been read, even if extraction raises part-way through.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)

        all_chunks.extend(
            {"text": chunk, "source": filename}
            for chunk in splitter.split_text(text)
        )

    return all_chunks

In [42]:
def store_chunks_in_chroma(chunks, persist_directory=db_name):
    """Embed text chunks and write them to a Chroma vector store.

    Args:
        chunks: Iterable of dicts carrying a "text" and a "source" key.
        persist_directory: Directory handed to Chroma as ``persist_directory``.

    Prints how many documents were stored; returns nothing.
    """
    documents = []
    for entry in chunks:
        documents.append(
            Document(page_content=entry["text"], metadata={"source": entry["source"]})
        )

    vectorstore = Chroma.from_documents(
        documents,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )

    # Call persist() only when the returned store actually provides it.
    if hasattr(vectorstore, "persist"):
        vectorstore.persist()

    print(f"Stored {len(documents)} documents in Chroma.")

In [59]:
# Shared embedding function, used both when clearing and when reopening the store.
embeddings = OpenAIEmbeddings()

def process_pdfs_to_chroma(folder_path=folder_path, persist_directory=db_name):
    """Rebuild the Chroma store at ``persist_directory`` from the PDFs in ``folder_path``.

    If ``persist_directory`` already exists, its collection is deleted first so
    the new chunks replace the old ones instead of being added alongside them.

    Args:
        folder_path: Directory containing the PDF files to ingest.
        persist_directory: Directory of the Chroma store to rebuild.
    """
    # Check and clear the directory that was passed in, not the module-level
    # default; otherwise a call with a custom persist_directory would wipe the
    # default store and leave stale data in the one being rebuilt.
    if os.path.exists(persist_directory):
        Chroma(
            persist_directory=persist_directory,
            embedding_function=embeddings,
        ).delete_collection()

    chunks = pdf_to_text_chunks(folder_path)
    store_chunks_in_chroma(chunks, persist_directory=persist_directory)

In [60]:
# Ingest the PDFs from the configured folder into the configured Chroma directory.
process_pdfs_to_chroma(folder_path=folder_path, persist_directory=db_name)

Stored 82 documents in Chroma.


In [61]:
# Chat model used to generate the answers.
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Conversation memory for the chat, stored under the 'chat_history' key.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Reopen the persisted Chroma store and wrap it in a retriever, the
# abstraction the chain queries during RAG.
vectorstore = Chroma(embedding_function=embeddings, persist_directory=db_name)
retriever = vectorstore.as_retriever()

# Combine the LLM, the retriever and the memory into one conversation chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [62]:
def chat(message, history):
    """Answer a single user message via the conversation chain.

    Args:
        message: The user's question, sent to the chain as "question".
        history: Chat history supplied by the caller; not used by this
            function (the chain is built with its own memory object).

    Returns:
        The "answer" entry of the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [63]:
# Build the Gradio chat UI around chat() and launch it in a browser tab;
# `view` holds whatever launch() returns.
view = gr.ChatInterface(
    chat,
    type="messages",
).launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7871
* To create a public link, set `share=True` in `launch()`.
