# Building Chroma DB for Retriever

This notebook is used to create vector data in Chroma DB for the retriever.

In [None]:
import copy
import glob
import os
import random
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [52]:
# Set the USER_AGENT environment variable for this process.
# NOTE(review): presumably read by LangChain community components to identify requests — confirm.
os.environ.update({"USER_AGENT": "llm-retriever-and-tavily/0.1"})

In [53]:
# Embedding model (served through Ollama) used to vectorize chunks for the Chroma store
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Chunking strategy: small chunks of at most 192 units with a 32-unit overlap,
# cut on blank lines first and on single line breaks otherwise.
# NOTE(review): units are presumably characters (the splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_overlap=32,
    chunk_size=192,
)

In [54]:
# Load documents: read every file matched by `documents_path` with PyPDFLoader,
# split it with `text_splitter`, and collect all chunks in `documents`.
documents_path = "data/document/*"
# glob returns matches in a filesystem-dependent order; sorting keeps the chunk
# order (and therefore any index into `documents`) stable across runs and platforms.
filepaths = sorted(glob.glob(documents_path))
if not filepaths:
    # Fail here with a clear message instead of failing later on an empty list.
    raise FileNotFoundError(f"No files matched {documents_path!r}")
documents = []
for filepath in filepaths:
    print(f"Processing {filepath}")
    loader = PyPDFLoader(filepath)
    docs = loader.load()
    splits = text_splitter.split_documents(docs)
    documents.extend(splits)

Processing data/document\Harry Potter And The Deathly Hallows.pdf
Processing data/document\Solo Leveling Volume 01 - PDF Room.pdf
Processing data/document\Sword Art Online - Volume 01 - Aincrad.pdf
Processing data/document\The Complete Sherlock Holmes.pdf
Processing data/document\The Hunger Games (The Hunger Games, Book 1) - PDF Room.pdf


In [55]:
# Check the length of the documents: total number of chunks collected from all loaded files
len(documents)

39142

In [56]:
# Print 5 random samples of the documents and remember their indexes in
# `sample_indexes` so the same chunks can be displayed again later.
# random.randrange excludes the upper bound, so every drawn index is valid;
# random.randint(0, len(documents)) includes it and could raise IndexError.
# Indexes are drawn independently, so the same chunk may be shown twice.
sample_indexes = []
if documents:  # nothing to sample from an empty list
    for _ in range(5):
        p = random.randrange(len(documents))
        sample_indexes.append(p)
        display(documents[p])

Document(metadata={'producer': '', 'creator': '', 'creationdate': '2014-03-15T14:16:40+01:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2014-03-15T14:16:40+01:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/MacPorts 2013_5) kpathsea version 6.1.1', 'source': 'data/document\\The Complete Sherlock Holmes.pdf', 'total_pages': 987, 'page': 929, 'page_label': '924'}, page_content='served as the local police-station. A walk of half a\nmile or so across a wind-swept heath, all gold and\nbronze with the fading ferns, brought us to a side-gate')

Document(metadata={'producer': '', 'creator': '', 'creationdate': '2014-03-15T14:16:40+01:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2014-03-15T14:16:40+01:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/MacPorts 2013_5) kpathsea version 6.1.1', 'source': 'data/document\\The Complete Sherlock Holmes.pdf', 'total_pages': 987, 'page': 926, 'page_label': '921'}, page_content='and it’s up to you to do it. Name your ﬁgure!”\n“My professional charges are upon a ﬁxed scale,”\nsaid Holmes coldly. “I do not vary them, save when I\nremit them altogether.”')

Document(metadata={'producer': 'pdfeTeX-1.21a', 'creator': 'Dark Miasma', 'creationdate': '2007-07-23T04:17:48-07:00', 'author': 'J. K. Rowling', 'title': 'Harry Potter and the Deathly Hallows', 'subject': '', 'keywords': '', 'ptex.fullbanner': 'This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) kpathsea version 3.5.4', 'source': 'data/document\\Harry Potter And The Deathly Hallows.pdf', 'total_pages': 768, 'page': 470, 'page_label': '463'}, page_content='way—“\n“HERMIONE! HERMIONE!”\n“We need a plan, stop yelling—we need to get these ropes oﬀ—\n“\n463')

Document(metadata={'producer': '', 'creator': '', 'creationdate': '2014-03-15T14:16:40+01:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2014-03-15T14:16:40+01:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/MacPorts 2013_5) kpathsea version 6.1.1', 'source': 'data/document\\The Complete Sherlock Holmes.pdf', 'total_pages': 987, 'page': 887, 'page_label': '882'}, page_content='The Adventure Of The Mazarin Stone\nHolmes shrugged his shoulders. “You can see in\nthe corner the parasol which you so politely handed\nto me in the Minories before you began to suspect.”')

Document(metadata={'producer': 'pdfeTeX-1.21a', 'creator': 'Dark Miasma', 'creationdate': '2007-07-23T04:17:48-07:00', 'author': 'J. K. Rowling', 'title': 'Harry Potter and the Deathly Hallows', 'subject': '', 'keywords': '', 'ptex.fullbanner': 'This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) kpathsea version 3.5.4', 'source': 'data/document\\Harry Potter And The Deathly Hallows.pdf', 'total_pages': 768, 'page': 593, 'page_label': '586'}, page_content='Cloak and throwing it over both of them. He gave the wall a little\npush.\nIt melted away at his touch and they slipped outside: Harry')

In [57]:
# Copy documents to a new list: keep an independent deep copy of the raw chunks
# so the in-place sanitizing done on `documents` can be undone.
documents_copy = copy.deepcopy(documents)

In [58]:
# Revert the documents to their original state: rebuild `documents` from the backup.
# A fresh deep copy is taken so the backup itself is never mutated by later cells.
documents = copy.deepcopy(documents_copy)

In [59]:
# Sanitize the documents: flatten line breaks inside every chunk, in place.
# The rules are applied in order; the specific patterns must run before the
# catch-all "\n" rule at the end.
newline_rules = (
    ("-\n", ""),        # hyphen directly followed by a line break: remove both
    ("”\n“", "”. “"),   # closing quote, line break, opening quote
    (".\n", ". "),      # full stop followed by a line break
    ("\n", " "),        # every remaining line break becomes a single space
)
for doc in documents:
    text = doc.page_content
    for old, new in newline_rules:
        text = text.replace(old, new)
    doc.page_content = text

In [60]:
# Show the previously sampled chunks again to inspect the effect of sanitizing
for sample_index in sample_indexes:
    display(documents[sample_index])

Document(metadata={'producer': '', 'creator': '', 'creationdate': '2014-03-15T14:16:40+01:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2014-03-15T14:16:40+01:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/MacPorts 2013_5) kpathsea version 6.1.1', 'source': 'data/document\\The Complete Sherlock Holmes.pdf', 'total_pages': 987, 'page': 929, 'page_label': '924'}, page_content='served as the local police-station. A walk of half a mile or so across a wind-swept heath, all gold and bronze with the fading ferns, brought us to a side-gate')

Document(metadata={'producer': '', 'creator': '', 'creationdate': '2014-03-15T14:16:40+01:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2014-03-15T14:16:40+01:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/MacPorts 2013_5) kpathsea version 6.1.1', 'source': 'data/document\\The Complete Sherlock Holmes.pdf', 'total_pages': 987, 'page': 926, 'page_label': '921'}, page_content='and it’s up to you to do it. Name your ﬁgure!”. “My professional charges are upon a ﬁxed scale,” said Holmes coldly. “I do not vary them, save when I remit them altogether.”')

Document(metadata={'producer': 'pdfeTeX-1.21a', 'creator': 'Dark Miasma', 'creationdate': '2007-07-23T04:17:48-07:00', 'author': 'J. K. Rowling', 'title': 'Harry Potter and the Deathly Hallows', 'subject': '', 'keywords': '', 'ptex.fullbanner': 'This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) kpathsea version 3.5.4', 'source': 'data/document\\Harry Potter And The Deathly Hallows.pdf', 'total_pages': 768, 'page': 470, 'page_label': '463'}, page_content='way—“ “HERMIONE! HERMIONE!”. “We need a plan, stop yelling—we need to get these ropes oﬀ— “ 463')

Document(metadata={'producer': '', 'creator': '', 'creationdate': '2014-03-15T14:16:40+01:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2014-03-15T14:16:40+01:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/MacPorts 2013_5) kpathsea version 6.1.1', 'source': 'data/document\\The Complete Sherlock Holmes.pdf', 'total_pages': 987, 'page': 887, 'page_label': '882'}, page_content='The Adventure Of The Mazarin Stone Holmes shrugged his shoulders. “You can see in the corner the parasol which you so politely handed to me in the Minories before you began to suspect.”')

Document(metadata={'producer': 'pdfeTeX-1.21a', 'creator': 'Dark Miasma', 'creationdate': '2007-07-23T04:17:48-07:00', 'author': 'J. K. Rowling', 'title': 'Harry Potter and the Deathly Hallows', 'subject': '', 'keywords': '', 'ptex.fullbanner': 'This is pdfeTeX, Version 3.141592-1.21a-2.2 (Web2C 7.5.4) kpathsea version 3.5.4', 'source': 'data/document\\Harry Potter And The Deathly Hallows.pdf', 'total_pages': 768, 'page': 593, 'page_label': '586'}, page_content='Cloak and throwing it over both of them. He gave the wall a little push. It melted away at his touch and they slipped outside: Harry')

In [61]:
# Open the persisted store and remove the collection left by a previous run
db = Chroma(
    embedding_function=embeddings,
    persist_directory="data/chroma_db",
)
db.delete_collection()

# Rebuild the store from the current chunks, persisted to the same directory
db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory="data/chroma_db",
)

## Test Retriever

Run and compare several retrievers to find the one that gives the best results.

In [62]:
# Baseline retriever: plain similarity search returning 5 chunks per query
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

In [63]:
# Run a sample question through the similarity retriever and display the returned chunks
retrieved_documents = retriever.invoke("Who is the main character in Sword Art Online?")
retrieved_documents

[Document(id='32d0bf0e-b75f-4374-97bc-66f17676e384', metadata={'author': 'Kadir', 'creationdate': '2016-09-09T14:43:10+02:00', 'creator': 'Microsoft® Word 2013', 'moddate': '2016-09-09T14:43:10+02:00', 'page': 57, 'page_label': '58', 'producer': 'Microsoft® Word 2013', 'source': 'data/document\\Sword Art Online - Volume 01 - Aincrad.pdf', 'total_pages': 293}, page_content='Sword Art Online Volume 1 – Aincrad     Chapter 4    58  The first person to get the honor of having his name crossed out  appeared three hours into the game.'),
 Document(id='4ea520c9-f3c5-45db-9224-53811786d2d4', metadata={'author': 'Kadir', 'creationdate': '2016-09-09T14:43:10+02:00', 'creator': 'Microsoft® Word 2013', 'moddate': '2016-09-09T14:43:10+02:00', 'page': 69, 'page_label': '70', 'producer': 'Microsoft® Word 2013', 'source': 'data/document\\Sword Art Online - Volume 01 - Aincrad.pdf', 'total_pages': 293}, page_content='Sword Art Online Volume 1 – Aincrad     Chapter 5    70  There were a lot of reasons, 

In [64]:
# Similarity search with score: query the store directly (same question, k=5)
# so each returned chunk comes paired with its score.
# NOTE(review): the printed scores increase down the ranked list, so the score is
# presumably a distance (lower = closer match) — confirm against the Chroma docs.
results = db.similarity_search_with_score("Who is the main character in Sword Art Online?", k=5)
for doc, score in results:
    print(f"Score: {score} | Document: {doc}")

Score: 0.5166754722595215 | Document: page_content='Sword Art Online Volume 1 – Aincrad     Chapter 4    58  The first person to get the honor of having his name crossed out  appeared three hours into the game.' metadata={'author': 'Kadir', 'creationdate': '2016-09-09T14:43:10+02:00', 'creator': 'Microsoft® Word 2013', 'moddate': '2016-09-09T14:43:10+02:00', 'page': 57, 'page_label': '58', 'producer': 'Microsoft® Word 2013', 'source': 'data/document\\Sword Art Online - Volume 01 - Aincrad.pdf', 'total_pages': 293}
Score: 0.5584264397621155 | Document: page_content='Sword Art Online Volume 1 – Aincrad     Chapter 5    70  There were a lot of reasons, but the first was that she was one of  the very few female players, and that she was the owner of a face' metadata={'author': 'Kadir', 'creationdate': '2016-09-09T14:43:10+02:00', 'creator': 'Microsoft® Word 2013', 'moddate': '2016-09-09T14:43:10+02:00', 'page': 69, 'page_label': '70', 'producer': 'Microsoft® Word 2013', 'source': 'data/doc

In [65]:
# Define a retriever that uses an LLM: MultiQueryRetriever built from a local
# llama3:8b chat model (temperature 0) on top of the Chroma store.
# NOTE(review): the base retriever is created with db.as_retriever() and no
# arguments, so it does not use the k=5 setting of `retriever` — confirm this is intended.
llm = ChatOllama(temperature=0, model="llama3:8b")
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=db.as_retriever(),
)

In [66]:
# Ask the same sample question through the LLM-backed multi-query retriever
retriever_from_llm.invoke("Who is the main character in Sword Art Online?")

[Document(id='d5538a9b-b2d8-4d1b-984e-f2b75bb7a43c', metadata={'author': 'Kadir', 'creationdate': '2016-09-09T14:43:10+02:00', 'creator': 'Microsoft® Word 2013', 'moddate': '2016-09-09T14:43:10+02:00', 'page': 277, 'page_label': '278', 'producer': 'Microsoft® Word 2013', 'source': 'data/document\\Sword Art Online - Volume 01 - Aincrad.pdf', 'total_pages': 293}, page_content="asking:  “...What about those who died? Both of us are already dead, yet we  continue to exist here. Doesn't that mean you can return the other"),
 Document(id='d9310032-a5d1-43d3-b693-3b20d82c7427', metadata={'author': '', 'creationdate': '2014-03-15T14:16:40+01:00', 'creator': '', 'keywords': '', 'moddate': '2014-03-15T14:16:40+01:00', 'page': 197, 'page_label': '192', 'producer': '', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/MacPorts 2013_5) kpathsea version 6.1.1', 'source': 'data/document\\The Complete Sherlock Holmes.pdf', 'subject': '', 'title': '', 'total_pages': 987, 