In [12]:
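# Optional one-time setup: uncomment the next line to install jupyter_bokeh into the kernel's environment.
# %pip install jupyter_bokeh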

In [1]:
import os
from datetime import datetime
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage, AIMessage
import panel as pn

## NOTE:
Below is a lightweight implementation of a RAG system that answers questions about private documents using OpenAI embeddings and a ChromaDB vector store. The user interface was inspired by the course, as it uses a runnable dashboard: first the user asks, "What is the name of the organization described in the private document?", and then the user asks, "What is a specific policy of this organization?" Both times the chatbot gives the correct answer: it first finds the name of the community described in the provided architectural standards, and afterwards finds that solar panels are not allowed in that particular community. Change the code below to run it locally; it is followed by my requirements.

In [3]:
# Set OpenAI API Key.
# Prefer the local `apikey.py` module (which must define `apikey`); if it cannot
# be imported, fall back to an OPENAI_API_KEY already present in the environment.
try:
    from apikey import apikey
except ImportError:
    apikey = os.environ.get('OPENAI_API_KEY')
if not apikey:
    # Fail here with an actionable message instead of later inside an API call.
    raise RuntimeError(
        "No OpenAI API key found: create apikey.py defining `apikey`, "
        "or set the OPENAI_API_KEY environment variable."
    )
os.environ['OPENAI_API_KEY'] = apikey

# Provide private document to do QA with
file_path = "Blue Ridge shadow architectural.pdf"
document_type = "foundational"  # label stored in every chunk's metadata
document_date = "2017-12-01"  # stored as a string; retrieval sorts on it, so keep YYYY-MM-DD

In [6]:
# List the packages installed in the active conda environment (output below).
!conda list

# packages in environment at /home/dima/anaconda3/envs/hoa_rag:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main  
_openmp_mutex             5.1                       1_gnu  
aiobotocore               2.12.3          py311h06a4308_0  
aiohappyeyeballs          2.4.0           py311h06a4308_0  
aiohttp                   3.10.5          py311h5eee18b_0  
aioitertools              0.7.1              pyhd3eb1b0_0  
aiosignal                 1.2.0              pyhd3eb1b0_0  
alabaster                 0.7.16          py311h06a4308_0  
altair                    5.0.1           py311h06a4308_0  
anaconda                  2024.10             py311_mkl_0  
annotated-types           0.7.0                    pypi_0    pypi
anyio                     4.2.0           py311h06a4308_0  
aom                       3.6.0                h6a678d5_0  
appdirs                   1.4.4              pyhd3eb1b0_0  
argon2-cffi          

In [4]:
# Initialize chat history: a module-level list of HumanMessage / AIMessage
# objects, appended to by collect_messages and replaced by clear_chat.
chat_history = []

def reset_database():
    """Remove the on-disk vector store at ./chroma_db, if there is one.

    The confirmation message is printed whether or not anything had to be
    deleted.
    """
    db_dir = "./chroma_db"
    db_present = os.path.exists(db_dir)
    if db_present:
        # Imported lazily: only needed when there is something to delete.
        from shutil import rmtree
        rmtree(db_dir)
    print("Database reset successfully.")

def add_document(file_path, document_type, document_date):
    """Load a document, split it into chunks, embed them and store them in ./chroma_db.

    Args:
        file_path: Path to the document. A ``.pdf`` extension (matched
            case-insensitively) is read with PyPDFLoader; anything else is
            read with TextLoader.
        document_type: Label written to every chunk's ``document_type`` metadata.
        document_date: Date string written to every chunk's ``document_date``
            metadata. retrieve_relevant_documents sorts on this string, so an
            ISO ``YYYY-MM-DD`` value keeps the ordering chronological.

    Raises:
        ValueError: If the loader/splitter produce no chunks, so there is
            nothing to embed.
    """
    # Case-insensitive so "report.PDF" is not mistakenly read as plain text.
    if file_path.lower().endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    else:
        loader = TextLoader(file_path)

    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(documents)
    if not splits:
        raise ValueError(f"No text could be extracted from '{file_path}'; nothing to index.")

    # One timestamp per upload, shared by all chunks of this document.
    upload_time = datetime.now().isoformat()  # Store as ISO format string
    for split in splits:
        split.metadata['document_type'] = document_type
        split.metadata['document_date'] = document_date  # Store as string
        split.metadata['upload_time'] = upload_time

    # Deterministic ids (path + chunk position) instead of random ones, so that
    # re-running this for the same file targets the same records rather than
    # appending a second copy of every chunk to the persisted collection.
    # NOTE(review): relies on the Chroma wrapper upserting by id — confirm for
    # the installed langchain/chromadb versions.
    chunk_ids = [f"{file_path}::chunk-{index}" for index in range(len(splits))]

    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(
        splits,
        embeddings,
        ids=chunk_ids,
        persist_directory="./chroma_db",
    )
    vectorstore.persist()
    print(f"Document '{file_path}' added successfully as {document_type}.")

def retrieve_relevant_documents(query, top_k=5):
    """Fetch the ``top_k`` closest chunks for ``query`` from ./chroma_db and rank them.

    Returns a list of ``(document, score)`` pairs ordered by ``document_date``
    descending (string comparison) and, within the same date, by ascending
    score. Pairs that compare equal keep the order the store returned them in.
    """
    def _rank_key(hit):
        # Negating the score means that, under the descending sort below,
        # the smallest score comes first among chunks sharing a date.
        # NOTE(review): presumably the score is a distance (lower = closer) — confirm.
        return (hit[0].metadata['document_date'], -hit[1])

    store = Chroma(
        persist_directory="./chroma_db",
        embedding_function=OpenAIEmbeddings(),
    )
    hits = store.similarity_search_with_score(query, k=top_k)

    ranked = list(hits)
    ranked.sort(key=_rank_key, reverse=True)
    return ranked

def answer_question(query, chat_history):
    """Answer ``query`` with gpt-3.5-turbo using retrieved chunks and recent chat turns.

    Args:
        query: The user's question; it is used for retrieval and fills the
            ``{question}`` slot of the prompt.
        chat_history: Sequence of message objects exposing ``.content``. Only
            the last five are included; HumanMessage instances are labelled
            "Human", everything else "AI".

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt.
    """
    # Retrieval first: every chunk becomes a "type/date" header line plus its text.
    context_blocks = []
    for doc, _score in retrieve_relevant_documents(query):
        header = f"Document Type: {doc.metadata['document_type']}, Date: {doc.metadata['document_date']}"
        context_blocks.append(f"{header}\n{doc.page_content}")
    context = "\n\n".join(context_blocks)

    llm = ChatOpenAI(model_name="gpt-3.5-turbo")
    prompt_template = """
    Use the following context and chat history to answer the query. 
    If no specific information is found, say that you don't have enough information to answer.
    
    Context:
    {context}
    
    Chat History:
    {chat_history}
    
    Human: {question}
    AI Assistant:
    """
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "chat_history", "question"],
    )
    chain = LLMChain(llm=llm, prompt=prompt)

    # Only the five most recent messages are forwarded to the model.
    history_lines = []
    for msg in chat_history[-5:]:
        speaker = 'Human' if isinstance(msg, HumanMessage) else 'AI'
        history_lines.append(f"{speaker}: {msg.content}")
    formatted_history = "\n".join(history_lines)

    return chain.run(context=context, chat_history=formatted_history, question=query)

def collect_messages(_):
    """Button callback: answer the typed question and return the rendered transcript.

    Reads the question from the ``inp`` text widget and clears the widget.
    A blank question is ignored: no LLM call is made and nothing is recorded.
    Otherwise the question is answered against the *prior* chat history and
    then the question/answer pair is appended to ``chat_history``.

    Args:
        _: Value supplied by the bound button; unused.

    Returns:
        A ``pn.Column`` with one Markdown row per message in ``chat_history``.
    """
    prompt = (inp.value or '').strip()
    inp.value = ''

    # NOTE(review): Panel appears to evaluate bound callbacks when the pane is
    # first rendered, before any click — without this guard that would send an
    # empty question to the LLM and store it in the history. Confirm.
    if prompt:
        # Ask before recording the question: the prompt template already adds
        # the current question, so appending it first would show it to the
        # model twice. This also keeps the history free of an orphan human
        # message if the LLM call raises.
        response = answer_question(prompt, chat_history)
        chat_history.append(HumanMessage(content=prompt))
        chat_history.append(AIMessage(content=response))

    return pn.Column(*[pn.Row(pn.pane.Markdown(f"**{'Human' if isinstance(msg, HumanMessage) else 'AI'}:** {msg.content}")) for msg in chat_history])

def clear_chat(_):
    """Button callback: drop the stored conversation.

    Rebinds the module-level ``chat_history`` to a new empty list (the old
    list object is left untouched) and returns an empty ``pn.Column``.

    Args:
        _: Value supplied by the bound button; unused.
    """
    global chat_history
    chat_history = list()
    empty_view = pn.Column()
    return empty_view



In [5]:
# Index the configured document: load it, split it into chunks, embed them and
# store them in the persistent ./chroma_db collection.
add_document(file_path, document_type, document_date)

  embeddings = OpenAIEmbeddings()


Document 'Blue Ridge shadow architectural.pdf' added successfully as foundational.


  vectorstore.persist()


In [9]:
# Load Panel's notebook extension first, before any widgets are created, so its
# front-end resources are in place when the dashboard is rendered below.
pn.extension()

# Create the chat interface
inp = pn.widgets.TextInput(value="", placeholder='Enter your question here…')
button_conversation = pn.widgets.Button(name="Ask")
button_clear = pn.widgets.Button(name="Clear Chat")

# Bind each callback to its button: the callback re-runs when the button is
# clicked and its return value becomes the content of the bound pane.
interactive_conversation = pn.bind(collect_messages, button_conversation)
interactive_clear = pn.bind(clear_chat, button_clear)
# NOTE(review): the conversation pane is bound only to the "Ask" button, so a
# "Clear Chat" click presumably leaves the old transcript on screen until the
# next question is asked — confirm.

# Create the dashboard
dashboard = pn.Column(
    pn.pane.Markdown("# Document Q&A System"),
    inp,
    pn.Row(button_conversation, button_clear),
    pn.panel(interactive_conversation, loading_indicator=True, height=400),
    pn.panel(interactive_clear, loading_indicator=True),
)

# Display the dashboard
dashboard.servable()