# LangChain - EnsembleRetriever - Experiments

by Adam Lang
Date: February 12, 2024

In [7]:
#install libraries
! pip install langchain --q
! pip install chromadb --q
! pip install rank_bm25 --q
! pip install sentence_transformers lark --quiet #for creating embeddings

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/509.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m256.0/509.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.1/41.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [8]:
# import langchain dependencies
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import Chroma

# Create a list to search
* lets use a simple topic from clinical history taking - `smoking history`

In [48]:
#create a list to search
doc_list = [
    "Does the patient smoke",
    "Patient uses tobacco once a month",
    "How many cigarettes per day?",
    "Do they have a history of smoking?",
    "Is the patient exposed to smoke at home",
    "Does he chew tobacco",
    "Does she use marijuana?",
    "Has he ever smoked weed",
    "How many pack years of cigarettes?",
    "Are they smoking tobacco products",
    "He smokes 10 pipes per day",
    "She likes to have half a pack of cigarettes every other day",
    "He smokes weed once a week",
    "He hasn't smoked in 1 year",
]

# BM25 keyword retriever

In [49]:
#initialize
bm25_retriever = BM25Retriever.from_texts(doc_list)
bm25_retriever.k = 3 #retrieves top 2 documents as top k

In [50]:
# keyword search
bm25_retriever.get_relevant_documents("smoking")

[Document(page_content='Are they smoking tobacco products'),
 Document(page_content="He hasn't smoked in 1 year"),
 Document(page_content='He smokes weed once a week')]

summary: We can see semantically the BM25 retriever which uses the TF-IDF algorithm returned 2 phrases semantically similar to the company "Apple".

In [51]:
# try with an outlier term
bm25_retriever.get_relevant_documents("marijuana")

[Document(page_content="He hasn't smoked in 1 year"),
 Document(page_content='He smokes weed once a week'),
 Document(page_content='She likes to have half a pack of cigarettes every other day')]

summary: results are irrelevant

# BGE embeddings - semantic/vector search



In [52]:
#import bge embeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [53]:
# instantiate BGEembeddings model

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True} #set to True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


In [54]:
#import chroma
from langchain.vectorstores import Chroma


### Creating Chroma vector database, passing doc list and embeddings

In [55]:
#load embeddings into Chroma - pass docs, embedding function and path of the db
db = Chroma.from_texts(doc_list,
                       embedding=embeddings,
                       persist_directory='./db')

### Vector search and retrieve top 2 documents

In [56]:
db_retriever = db.as_retriever(search_kwargs={"k":3})

In [57]:
db_retriever.get_relevant_documents("marijuana") #use keyword to search

[Document(page_content='Does she use marijuana?'),
 Document(page_content='Does she use marijuana?'),
 Document(page_content='Has he ever smoked weed')]

summary: we can see the output is more relevant when we use these combined methods of keyword and semantic search

# LangChain Ensemble Retriever
* Weights randomly initialized below are based on:
1. bm25_retriever (first)
2. db_retriever (second)

* These are purely based on the previous testing above where the db_retriever has better performance

In [58]:
#initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever,
                                                   db_retriever],
                                       weights=[0.4, 0.6])

### Hybrid Search

In [63]:
#query 1
docs = ensemble_retriever.get_relevant_documents("Does he smoke more than once a day?")
docs

[Document(page_content='He smokes weed once a week'),
 Document(page_content='He smokes 10 pipes per day'),
 Document(page_content='She likes to have half a pack of cigarettes every other day'),
 Document(page_content='Does he chew tobacco'),
 Document(page_content='Does the patient smoke')]

In [64]:
#query 2
docs = ensemble_retriever.get_relevant_documents("cigarette")
docs

[Document(page_content='How many cigarettes per day?'),
 Document(page_content='How many pack years of cigarettes?'),
 Document(page_content="He hasn't smoked in 1 year"),
 Document(page_content='He smokes weed once a week'),
 Document(page_content='She likes to have half a pack of cigarettes every other day')]