In [3]:
!pip install langchain
!pip install langchain_community
!pip install langchain_core
!pip install langchain_openai
!pip install chromadb



In [2]:
import os
from getpass import getpass

# Keep the secret out of the notebook file: use the OPENAI_API_KEY environment
# variable when it is set, otherwise ask for the key with a hidden prompt.
OpenAI_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [4]:
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader



In [5]:
# Fetch one web page and wrap its content as documents
loader = WebBaseLoader("http://www.paulgraham.com/wealth.html")
docs = loader.load()

# Cut the page into chunks of at most 2000 characters, with no overlap between neighbours
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
chunks = text_splitter.split_documents(docs)

# Report how many chunks the splitter produced
print(f"Your {len(docs)} documents have been split into {len(chunks)} chunks")

Your 1 documents have been split into 28 chunks


In [7]:
# Embedding model used to vectorize every chunk (authenticated with OpenAI_key)
embedding = OpenAIEmbeddings(openai_api_key=OpenAI_key)

# Build the Chroma vector store from the chunks; no persist directory is passed here
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
)

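We build two retrievers over the same vector store so that their outputs can be compared with each other:

Vanilla - regular top-k similarity search   
MMR - maximal marginal relevance (MMR) search, which trades some similarity to the query for more diversity among the returned documents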

In [8]:
# Plain similarity search returning the 8 closest chunks
retriever_vanilla = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 8},
)

# MMR search over the same store, also returning 8 chunks so the two result lists are comparable
retriever_mmr = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 8},
)

Let's go get the docs that come from the vanilla retriever

In [9]:
# invoke() replaces the deprecated get_relevant_documents() and returns the same list of documents
vanilla_relevant_docs = retriever_vanilla.invoke("What is the best way to make and keep wealth?")

  warn_deprecated(


And now the docs that come from the MMR retriever

In [10]:
# invoke() replaces the deprecated get_relevant_documents() and returns the same list of documents
mmr_relevant_docs = retriever_mmr.invoke("What is the best way to make and keep wealth?")

This is a long-winded helper function that compares the two lists of documents with each other

In [11]:
def analyze_list_overlap(list1, list2, content_attr='page_content'):
    """
    Count how much two lists of objects overlap, comparing them by one attribute.

    Objects are matched by the value of ``content_attr`` rather than by identity.
    The overlap and "unique" counts are therefore counts of *distinct* attribute
    values, while the two totals are the plain lengths of the input lists and
    include any duplicates.

    Parameters:
    list1 (list): The first list of objects to compare.
    list2 (list): The second list of objects to compare.
    content_attr (str): Name of the attribute read from every object with
        ``getattr``; its values are stored in sets, so they must be hashable.

    Returns:
    dict: Integer counts under the keys 'total_list1', 'total_list2',
          'overlap_count', 'unique_to_list1_count' and 'unique_to_list2_count'.

    Raises:
    AttributeError: If an object in either list has no ``content_attr`` attribute.
    """
    # Distinct attribute values seen in each list
    contents1 = {getattr(doc, content_attr) for doc in list1}
    contents2 = {getattr(doc, content_attr) for doc in list2}

    # Only counts are reported, so set arithmetic is all that is needed; there is
    # no need to walk the lists again to collect the matching objects.
    return {
        'total_list1': len(list1),
        'total_list2': len(list2),
        'overlap_count': len(contents1 & contents2),
        'unique_to_list1_count': len(contents1 - contents2),
        'unique_to_list2_count': len(contents2 - contents1),
    }

Let's actually compare the two lists and see what we have.

In [12]:
# list1 = vanilla results, list2 = MMR results; the returned dict is shown as the cell output
analyze_list_overlap(
    list1=vanilla_relevant_docs,
    list2=mmr_relevant_docs,
)

{'total_list1': 8,
 'total_list2': 8,
 'overlap_count': 6,
 'unique_to_list1_count': 2,
 'unique_to_list2_count': 2}

If you inspect the 2 documents that only the MMR retriever returned, you would expect them to be more diverse than the 2 documents that only the vanilla retriever returned.

### References

- [Allowed Search Types (LangChain)](https://github.com/langchain-ai/langchain/blob/60d025b83be4d4f884c67819904383ccd89cff87/libs/langchain/langchain/schema/vectorstore.py#L624)
- [Code example](https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/Retrieval_With_MMR.ipynb)