In [None]:
!pip -q install langchain huggingface_hub openai google-search-results tiktoken chromadb rank_bm25 faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from getpass import getpass

# Provide the OpenAI credential without ever writing it into the notebook.
# A key that is already exported in the environment is left untouched (the
# previous unconditional assignment replaced it with an empty string); the
# user is prompted, with hidden input, only when no non-empty key is present.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
!pip show langchain

Name: langchain
Version: 0.0.305
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, anyio, async-timeout, dataclasses-json, jsonpatch, langsmith, numexpr, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


## BM25 Retriever - Sparse retriever

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document

from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding model created with no arguments; it is handed to FAISS.from_texts
# later in the notebook to embed the corpus for dense retrieval.
# NOTE(review): presumably reads the OPENAI_API_KEY environment variable set in
# an earlier cell — confirm.
embedding = OpenAIEmbeddings()


In [None]:
# Toy corpus shared by every retriever in this notebook. The sentences mix
# "apple" the fruit with "Apple" the company so that keyword-based and
# embedding-based retrieval can be compared on the same queries.
doc_list = [
    "I like apples",                  # fruit, plural
    "I like oranges",                 # a different fruit
    "Apples and oranges are fruits",  # mentions the category word "fruits"
    "I like computers by Apple",      # the company, capitalised singular
    "I love fruit juice",             # contains the bare word "fruit"
]

In [None]:
# Initialize the sparse (BM25) retriever over the toy corpus.
# Only the BM25 retriever is built in this cell; the FAISS retriever is
# created separately in the dense-retriever section.
bm25_retriever = BM25Retriever.from_texts(doc_list)
# Limit every query to the two top-ranked documents.
bm25_retriever.k = 2

In [None]:
# Sparse lookup for the single term "Apple".
# NOTE(review): the recorded output returns the "computers by Apple" sentence
# but neither sentence containing "apples"/"Apples", which suggests matching on
# exact, case-sensitive tokens — confirm against the retriever's preprocessing.
bm25_retriever.get_relevant_documents("Apple")

[Document(page_content='I like computers by Apple'),
 Document(page_content='I love fruit juice')]

In [None]:
# Sparse lookup for a descriptive phrase rather than a single keyword.
# NOTE(review): the recorded output ranks the "fruit juice" sentence first and
# does not return the sentence containing "fruits", again pointing at exact
# token matching — confirm.
bm25_retriever.get_relevant_documents("a green fruit")

[Document(page_content='I love fruit juice'),
 Document(page_content='I like computers by Apple')]

In [None]:
# Inspect the retriever's configuration (vectorizer, stored docs, k).
# NOTE(review): `dict` is referenced here without being called, so the cell
# displays the bound method (whose repr happens to embed the retriever's
# fields) rather than a dictionary. If a dict was intended, call
# `bm25_retriever.dict()`; if only the repr is wanted, `bm25_retriever` suffices.
bm25_retriever.dict

<bound method BaseModel.dict of BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7a216882f0a0>, docs=[Document(page_content='I like apples'), Document(page_content='I like oranges'), Document(page_content='Apples and oranges are fruits'), Document(page_content='I like computers by Apple'), Document(page_content='I love fruit juice')], k=2)>

## Embeddings - Dense retriever (FAISS)

In [None]:
# Embed the toy corpus and index it in a FAISS vector store.
faiss_vectorstore = FAISS.from_texts(
    texts=doc_list,
    embedding=embedding,
)
# Wrap the store as a retriever that returns two documents per query,
# matching the k used for the BM25 retriever.
faiss_retriever = faiss_vectorstore.as_retriever(
    search_kwargs={"k": 2},
)

In [None]:
# Dense lookup for the descriptive phrase.
# In the recorded output both returned sentences are about fruit
# ("Apples and oranges are fruits", "I like apples").
faiss_retriever.get_relevant_documents("A green fruit")

[Document(page_content='Apples and oranges are fruits'),
 Document(page_content='I like apples')]

## Ensemble Retriever

In [None]:
# Build the hybrid retriever: results from the sparse (BM25) and dense (FAISS)
# retrievers are combined, with each source given the same weight.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

In [None]:
# Hybrid lookup for the phrase already tried against each retriever on its own.
# The recorded output holds four documents: the two returned by BM25 plus the
# two returned by FAISS for this phrase, merged into a single ranking.
docs = ensemble_retriever.get_relevant_documents("A green fruit")
# Bare expression so the notebook displays the merged result list.
docs

[Document(page_content='I love fruit juice'),
 Document(page_content='Apples and oranges are fruits'),
 Document(page_content='I like apples'),
 Document(page_content='I like computers by Apple')]

In [None]:
# Hybrid lookup for a query about the company rather than the fruit.
# Note that `docs` is rebound here, replacing the result of the previous query.
# In the recorded output the "computers by Apple" sentence is ranked first.
docs = ensemble_retriever.get_relevant_documents("Apple Phones")
# Bare expression so the notebook displays the merged result list.
docs

[Document(page_content='I like computers by Apple'),
 Document(page_content='I like apples'),
 Document(page_content='I love fruit juice')]