# Running a basic RAG-powered LLM application using Mistral

**parse.py**: defines find_documents() and load_documents(), which locate and store each file in a form that can be used by a RAG system.

This notebook loads all the information from files in the chosen directory into a Chroma DB collection. The files are divided according to the default parameters used by the loaders in load_documents. A query is used to extract information from the collection, and the query, context, and some extra information are combined into an LLM prompt that Mistral uses to respond.

Areas of improvement: 
1. change parameters by which documents are split upon loading (done in parse.py load_documents())
2. integrate an embedding model when documents are added to the Chroma collection (this file)
3. use a pipeline to make the retrieval multi-step or metadata-aware (this file, although it will likely require a lot of code that may end up in other files as well)


In [None]:
from parse import find_documents, load_documents
import chromadb
from chromadb.config import Settings
from langchain_community.llms import Ollama
from embedding_util import CustomEmbeddingFunction

In [None]:
# Number of chunks requested from the collection for each query.
NUM_DOCUMENTS = 2
# Retrieved chunks are only passed to the LLM when their distance is below
# this value; presumably set this high so that nothing is filtered out.
THRESHOLD = 99999
# The notebook's title, description and output labels all refer to Mistral,
# so load the Mistral model (fetch it first with `ollama pull mistral`).
llm = Ollama(model="mistral")
# Presumably a placeholder: point this at the directory of documents to index.
TARGET_DIR = 'SOURCE_DIRECTORY'

NOTE on the cell below: it produces an "Ignoring wrong pointing object x y (offset z)" message. I believe this comes from the PyPDFLoader function call in load_documents. Cause? Is it an issue, or can we just leave it there?

In [None]:
# Locate the files for TARGET_DIR, then load them into `documents`.
paths, filenames = find_documents(TARGET_DIR)
documents = load_documents(paths, filenames)

# Last expression in the cell, so the notebook displays how many documents were loaded.
len(documents)

Usually takes 10-15 minutes to run when the thermo textbook is being processed.

In [None]:
#DO NOT RUN
# Create a Chroma client whose state may later be wiped with client.reset().
client = chromadb.Client(Settings(allow_reset=True))

# Store every loaded chunk in a new collection, using the chunk's position in
# `documents` as its id.
db = client.create_collection("newcoll")
db.add(
    ids=[str(position) for position, _ in enumerate(documents)],
    documents=[chunk.page_content for chunk in documents],
    metadatas=[chunk.metadata for chunk in documents],
)

In [None]:
# Wipe the Chroma state left behind by an earlier run of this cell. On a fresh
# kernel `client` does not exist yet (the only other cell that creates it is
# marked "DO NOT RUN"), so look the name up instead of referencing it directly,
# which would raise NameError.
if globals().get("client"):
    client.reset()

client = chromadb.Client(Settings(allow_reset=True))
db = client.get_or_create_collection(
    name='test', embedding_function=CustomEmbeddingFunction()
)

# Index every loaded chunk, using its position in `documents` as its id.
db.add(
    ids=[str(i) for i in range(len(documents))],
    documents=[doc.page_content for doc in documents],
    metadatas=[doc.metadata for doc in documents],
)

In [None]:
#DO NOT RUN

def create_prompt(context, question):
    """Build an instruction-tagged prompt from retrieved context and a question.

    The system instructions, the context and the question are wrapped in
    ``<s>[INST] ... [/INST]``; a request to list the useful sources is
    appended after the closing tag.

    Args:
        context: Retrieved material; interpolated into the prompt text.
        question: The user's question; interpolated the same way.

    Returns:
        The assembled prompt string.
    """
    instructions = """You are a helpful assistant, you will use the provided context to answer user questions.
    Read the given context before answering questions and think step by step. If you can not answer a user question based on 
    the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""
    inst_open = "<s>[INST] "
    inst_close = " [/INST]"

    context_and_question = f"""
            
    Context: {context}
    User: {question}"""
    cite_sources = """\n\nFinally, if any of the context sources supplied to you were useful, list each source like this:
                [path to source 1], [path to source 2] etc."""

    return "".join(
        [inst_open, instructions, context_and_question, inst_close, cite_sources]
    )

In [None]:
def create_prompt(context, question):
    """Build a prompt that asks for a per-source, citation-style answer.

    This version makes the LLM respond with a narrower answer that is more
    likely to quote the source directly and to list the specific file path
    (sometimes even the section or page number). The response is also less
    robust and works less well with more than one source: the more sources
    are provided, the less likely each one is to be cited.

    Args:
        context: Retrieved material; interpolated into the prompt text, so
            any object with a string form works.
        question: The user's question; interpolated the same way.

    Returns:
        The assembled prompt string.
    """
    # Named `prompt` rather than `str` so the builtin is not shadowed.
    prompt = f"""
    
    You are a helpful assistant that will use some provided context to answer the following question. Before you answer, read the context and think
    about how it relates to and answers the question. If you can't answer a question based on the context, simply state that you could not find any useful 
    information to help answer. Do not use any other information besides the provided context.

    {context}
    User:{question}

    Use this format:
    [Filepath] : 
    [information learned from source]

    Thank you!
    """

    return prompt

In [None]:
def create_prompt(context, question):
    """Build the prompt that asks the LLM to answer and say where the answer came from.

    This is the last ``create_prompt`` defined in the notebook, so it is the
    one in effect when the cells are run in order.

    Args:
        context: Retrieved material; interpolated into the prompt text, so
            any object with a string form works (get_llm_response passes a
            list of [metadata, text] pairs).
        question: The user's question; interpolated the same way.

    Returns:
        The assembled prompt string.
    """
    # Named `prompt` rather than `str` so the builtin is not shadowed.
    prompt = f"""
    
    You are a helpful assistant that will use some provided context to answer the following question. Before you answer, read the context and think about how it relates to the question. You may be provided with one or several filepaths containing context. If any of the context is relevant, make sure you tell me where it came from-- this may be in the form of a file path, a chapter, or page number. However, if the context is not relevant, prioritize answering the question fully and acknowledge that the provided information was not helpful.

    {context}
    User:{question}

    Please respond by telling me what information you found, where it came from, and then use it to answer the question. 

    Thank you!
    """

    return prompt

In [None]:
def print_recieved_documents(document_list):
    """Print the metadata and text of every result for the first query.

    Args:
        document_list: Query result mapping with 'ids', 'metadatas' and
            'documents' entries; only index 0 of each entry is read.
    """
    separator = '************'
    for position, _doc_id in enumerate(document_list['ids'][0]):
        print(separator)
        metadata = document_list['metadatas'][0][position]
        print(f"Filepath: {metadata}")
        content = document_list['documents'][0][position]
        print(f"Content: {content}")
        print(separator)

In [None]:
# code to view the prompt
#temp = db.query(query_texts='What is the mass of a proton?', n_results=3)
#print_recieved_documents(temp)

#print(create_prompt(temp, 'What is the mass of a proton?'))

In [None]:
def get_llm_response(question, show_context = True):
    """Answer ``question`` with the LLM, using chunks retrieved from ``db`` as context.

    Queries the collection for NUM_DOCUMENTS results, lists the metadata of
    every retrieved chunk, keeps the chunks whose distance is below
    THRESHOLD, and prints the model's reply to the resulting prompt.

    Args:
        question: Query text, used both for retrieval and in the prompt.
        show_context: When true, also print each retrieved chunk in full
            before the summary list.

    Returns:
        None. The response is printed rather than returned.
    """
    hits = db.query(query_texts=question, n_results=NUM_DOCUMENTS)
    if show_context:
        print_recieved_documents(hits)

    print("SELECTED EXCERPTS:")
    for source in hits['metadatas'][0]:
        print(f'    - {source}')
    print()

    # Pair the metadata and text of each chunk that is close enough to the query.
    relevant_docs = [
        [hits['metadatas'][0][i], hits['documents'][0][i]]
        for i, distance in enumerate(hits['distances'][0])
        if distance < THRESHOLD
    ]

    print(llm.invoke(create_prompt(relevant_docs, question)))

In [None]:
# Question used by the comparison cell: it is sent to the model both with and without retrieved context.
QUERY = "What is temperature? Can an individual particle have a temperature?"

In [None]:
# Show the question, then the retrieval-augmented answer, then the bare-model
# answer, so the two responses can be compared.
print(QUERY, end="\n\n")
print("MISTRAL WITH RAG:", end="\n\n")
# Use get_llm_response(QUERY) instead to also print the retrieved chunks in full.
get_llm_response(QUERY, show_context=False)
print()
print("MISTRAL:")
print(llm.invoke(QUERY))