In [1]:
import dotenv

# Load settings from a .env file into the environment before any client is created.
# Presumably this supplies the OpenAI API key used by the OpenAI classes below — confirm.
dotenv.load_dotenv()

True

In this document I will explore different kinds of vector retrievers in LangChain. I will first try the vanilla `Vectorstore` retriever, then `MultiVectorRetriever` with embeddings of smaller chunks (which is similar to `ParentDocumentRetriever`), and finally `MultiVectorRetriever` with summary embeddings. I will use the same query throughout to compare their performance.

In [2]:
import uuid

import bs4
from langchain import hub
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
from langchain_openai import ChatOpenAI

# Chat model pinned to an explicit model name.
# NOTE(review): no later cell visible here references `llm`; the summary chain
# constructs its own ChatOpenAI instance instead — confirm whether that is intended.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

## Load pages and split into chunks

In [4]:
# Source articles to index; both URLs are handed to the web loader below.
pages = [
    "https://zylo.com/blog/guide-saas-renewal/",
    "https://zylo.com/blog/saas-management/",
]

In [5]:
# Restrict HTML parsing to elements whose class matches one of these values,
# so the rest of the page markup is not turned into document text.
content_strainer = bs4.SoupStrainer(
    class_=("hero__text-section-container col col-12 col-lg-6", "site-main")
)
loader = WebBaseLoader(web_paths=pages, bs_kwargs={"parse_only": content_strainer})

# Fetch and parse every URL in `pages`.
docs = loader.load()

In [6]:
# Split the loaded pages into chunks with chunk_size=1000
# (measured in characters by default — confirm against the splitter's settings).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
# NOTE: `docs` is rebound here; from this point on it holds the chunks, not the
# raw pages, and these chunks are what later cells treat as "parent" documents.
docs = text_splitter.split_documents(docs)

In [7]:
len(docs)

76

# Vanilla retriever: vectorstore

In [8]:
# Baseline: embed every chunk with OpenAI embeddings and index it in Chroma.
vectorstore = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())
# Plain similarity search, configured to return 6 results per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [9]:
# Baseline query; the same question is reused for the other retrievers below.
retrieved_docs = retriever.invoke("what is saas?")

In [10]:
retrieved_docs[0]

Document(page_content='SaaS is hosted in the cloud and owned by the vendor/supplier.\xa0\nLicenses are by subscription, giving you flexibility in deploying and scaling your software across the business and allowing for shorter-term commitments.\nIT is no longer the sole buyer of SaaS – everyone at the company purchases it, often on credit cards. This decentralized purchasing leads to duplicate purchases and app redundancies that waste money and hinder efficiency.', metadata={'source': 'https://zylo.com/blog/saas-management/'})

## MultiVectorRetriever with smaller chunks embedding

In [11]:
# The vectorstore to use to index the child chunks
# (this rebinds `vectorstore` and `retriever` from the baseline section)
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=OpenAIEmbeddings()
)
# The storage layer for the parent documents
store = InMemoryByteStore()
# Metadata key under which each child chunk records the id of its parent
id_key = "doc_id"
# The retriever (empty to start); documents are added in a later cell
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
    search_kwargs={"k": 10},
)

# One random id per parent chunk, aligned by position with `docs`.
# `uuid` is imported in the notebook's import cell rather than mid-cell here.
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [12]:
# The splitter to use to create smaller chunks: chunk_size=400 versus the 1000
# used for the parent chunks, so a parent can yield several child chunks
# (76 parents -> 226 children in the recorded run).
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [13]:
# Break every parent chunk into child chunks and stamp each child with its
# parent's id under `id_key`, so a child hit can be traced back to its parent.
sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    children = child_text_splitter.split_documents([parent_doc])
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs.extend(children)

In [14]:
len(sub_docs)

226

In [15]:
# Index the child chunks in the vector store (these are what get embedded and searched) ...
retriever.vectorstore.add_documents(sub_docs)
# ... and store the parent chunks under the ids that the children carry in their metadata.
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [16]:
# Query the vector store directly: the top hit is a child chunk
# (its metadata carries `doc_id`), not the parent chunk.
retriever.vectorstore.similarity_search("what is saas?")[0]

Document(page_content='SaaS is hosted in the cloud and owned by the vendor/supplier.\xa0\nLicenses are by subscription, giving you flexibility in deploying and scaling your software across the business and allowing for shorter-term commitments.', metadata={'doc_id': '1333a544-3772-49aa-a20c-a79dcf648bf7', 'source': 'https://zylo.com/blog/saas-management/'})

In [17]:
# Query through the retriever: child hits are resolved to their parent chunks.
parent_retrieved_docs = retriever.invoke("what is saas?")

In [18]:
parent_retrieved_docs[0]

Document(page_content='SaaS is hosted in the cloud and owned by the vendor/supplier.\xa0\nLicenses are by subscription, giving you flexibility in deploying and scaling your software across the business and allowing for shorter-term commitments.\nIT is no longer the sole buyer of SaaS – everyone at the company purchases it, often on credit cards. This decentralized purchasing leads to duplicate purchases and app redundancies that waste money and hinder efficiency.', metadata={'source': 'https://zylo.com/blog/saas-management/'})

# MultiVectorRetriever with summary embeddings

In [19]:
# Summarisation pipeline: take a Document, put its text into the prompt,
# call the chat model and return the reply as a plain string.
# The model is created with max_retries=0 and no explicit model name.
summarize_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document:\n\n{doc}"
)
chain = (
    {"doc": lambda document: document.page_content}
    | summarize_prompt
    | ChatOpenAI(max_retries=0)
    | StrOutputParser()
)

In [20]:
# Summarise every chunk, with concurrency capped at 5.
# NOTE(review): this makes one model call per chunk, and the chain's model is built
# with max_retries=0, so presumably one failed request fails the whole batch — confirm.
summaries = chain.batch(docs, {"max_concurrency": 5})

In [21]:
len(summaries)

76

In [22]:
# The vectorstore to use to index the summaries
# (this rebinds `vectorstore`, `store`, `retriever` and `doc_ids` from the child-chunk section)
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())
# The storage layer for the parent documents
store = InMemoryByteStore()
# Metadata key under which each summary records the id of the chunk it summarises
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
    search_kwargs = {"k":10}
)
# Fresh ids for this retriever, one per parent chunk; relies on `uuid` imported in an earlier cell
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [23]:
# Wrap each summary in a Document tagged with the id of the chunk it summarises;
# `summaries` and `doc_ids` are aligned by position.
summary_docs = [
    Document(page_content=summary_text, metadata={id_key: parent_id})
    for parent_id, summary_text in zip(doc_ids, summaries)
]

In [24]:
# Index the summaries in the vector store (these are what get embedded and searched) ...
retriever.vectorstore.add_documents(summary_docs)
# ... and store the original chunks under the ids that the summaries carry in their metadata.
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [25]:
# Searching the vector store directly returns the summary documents themselves
# (their metadata carries only `doc_id`).
vectorstore.similarity_search("what is saas?")

[Document(page_content='The document explains that Software as a Service (SaaS) is hosted in the cloud and owned by the vendor, allowing for flexibility in deployment and scaling. Licenses are obtained through subscription, allowing for shorter-term commitments. It notes that purchasing of SaaS has become decentralized within companies, leading to duplicate purchases and inefficiencies.', metadata={'doc_id': '1c384c24-b623-42be-b400-815e5d4c2c49'}),
 Document(page_content="The document discusses the importance of managing Software as a Service (SaaS) applications in modern businesses. It defines SaaS Management as the practice of proactively managing, optimizing, and governing all SaaS applications within a company's portfolio. The document highlights the significance, benefits, and impact of SaaS Management on organizations in the modern era.", metadata={'doc_id': 'da31631a-02b5-4708-80dc-84d42636cbb6'}),
 Document(page_content='The document discusses different terms for the process o

In [27]:
# Through the retriever, summary hits are mapped back to the original chunks.
summary_retrieved_docs = retriever.invoke("what is saas?")

In [28]:
summary_retrieved_docs[0]

Document(page_content='SaaS is hosted in the cloud and owned by the vendor/supplier.\xa0\nLicenses are by subscription, giving you flexibility in deploying and scaling your software across the business and allowing for shorter-term commitments.\nIT is no longer the sole buyer of SaaS – everyone at the company purchases it, often on credit cards. This decentralized purchasing leads to duplicate purchases and app redundancies that waste money and hinder efficiency.', metadata={'source': 'https://zylo.com/blog/saas-management/'})